diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 02d15b7f3..b88e3671b 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -121,5 +121,11 @@ In chronological order: * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1). ARMv8 support. +* Dan Kortschak + * [2015-01-07] Added test for drotmg bug #484. + +* Ton van den Heuvel + * [2015-03-18] Fix race condition during shutdown causing a crash in gotoblas_set_affinity(). + * [Your name or handle] <[email or website]> * [Date] [Brief summary of your changes] diff --git a/Changelog.txt b/Changelog.txt index b11321f71..6941a9f96 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,24 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.14 +24-Mar-2015 +common: + * Improve OpenBLASConfig.cmake. (#474, #475. Thanks, xantares.) + * Improve ger and gemv for small matrices by stack allocation. + e.g. make MAX_STACK_ALLOC=2048 (#482. Thanks, Jerome Robert.) + * Introduce openblas_get_num_threads and openblas_get_num_procs. + (#497. Thanks, Erik Schnetter.) + * Add ATLAS-style ?geadd function. (#509. Thanks, Martin Köhler.) + * Fix c/zsyr bug with negative incx. (#492.) + * Fix race condition during shutdown causing a crash in + gotoblas_set_affinity(). (#508. Thanks, Ton van den Heuvel.) + +x86/x86-64: + * Support AMD Steamroller. + +ARM: + * Add Cortex-A9 and Cortex-A15 targets. 
+ ==================================================================== Version 0.2.13 3-Dec-2014 diff --git a/GotoBLAS_05LargePage.txt b/GotoBLAS_05LargePage.txt index ec5106fcd..c3e171a88 100644 --- a/GotoBLAS_05LargePage.txt +++ b/GotoBLAS_05LargePage.txt @@ -9,10 +9,10 @@ If you want to allocate 64 large pages, - $shell> echo 0 > /pros/sys/vm/nr_hugepages # need to be reset - $shell> echo 65 > /pros/sys/vm/nr_hugepages # add 1 extra page - $shell> echo 3355443200 > /pros/sys/kernel/shmmax # just large number - $shell> echo 3355443200 > /pros/sys/kernel/shmall + $shell> echo 0 > /proc/sys/vm/nr_hugepages # need to be reset + $shell> echo 65 > /proc/sys/vm/nr_hugepages # add 1 extra page + $shell> echo 3355443200 > /proc/sys/kernel/shmmax # just large number + $shell> echo 3355443200 > /proc/sys/kernel/shmall Also may add a few lines into /etc/security/limits.conf file. diff --git a/Makefile.arm b/Makefile.arm index 5bdd4d151..9978a672a 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -1,3 +1,8 @@ +# ifeq logical or +ifeq ($(CORE), $(filter $(CORE),CORTEXA9 CORTEXA15)) +CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +FCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a +endif ifeq ($(CORE), ARMV7) CCOMMON_OPT += -marm -mfpu=vfpv3 -mfloat-abi=hard -march=armv7-a diff --git a/Makefile.install b/Makefile.install index 04323eef5..e1deaae3e 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) -OPENBLAS_CMAKE_DIR := $(PREFIX)/cmake +OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake .PHONY : install @@ -46,11 +46,11 @@ ifndef NO_CBLAS endif ifndef NO_LAPACKE - @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_LIBRARY_DIR) - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h 
$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h - @-install -pDm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h + @echo Copying LAPACKE header files to $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_config.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_config.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_mangling_with_flags.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_mangling.h + @-install -pm644 $(NETLIB_LAPACK_DIR)/lapacke/include/lapacke_utils.h $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)/lapacke_utils.h endif #for install static library @@ -95,7 +95,8 @@ endif endif #Generating OpenBLASConfig.cmake @echo Generating $(OPENBLAS_CMAKE_CONFIG) in $(DESTDIR)$(OPENBLAS_CMAKE_DIR) - @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_VERSION \"${VERSION}\")" > $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) + @echo "SET(OpenBLAS_INCLUDE_DIRS ${OPENBLAS_INCLUDE_DIR})" >> $(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG) ifndef NO_SHARED #ifeq logical or ifeq ($(OSNAME), $(filter $(OSNAME),Linux FreeBSD NetBSD)) diff --git a/Makefile.rule b/Makefile.rule index d3a2d1fa3..1479de660 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.13 +VERSION = 0.2.14 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. 
Meanwhile, the soname in shared library @@ -159,6 +159,19 @@ COMMON_PROF = -pg # Build Debug version # DEBUG = 1 +# Improve GEMV and GER for small matrices by stack allocation. +# For details, https://github.com/xianyi/OpenBLAS/pull/482 +# +# MAX_STACK_ALLOC=2048 + +# Add a prefix or suffix to all exported symbol names in the shared library. +# Avoid conflicts with other BLAS libraries, especially when using +# 64 bit integer interfaces in OpenBLAS. +# For details, https://github.com/xianyi/OpenBLAS/pull/459 +# +# SYMBOLPREFIX= +# SYMBOLSUFFIX= + # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index ec6339d62..525daa41b 100644 --- a/Makefile.system +++ b/Makefile.system @@ -61,6 +61,9 @@ endif ifeq ($(TARGET), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -85,6 +88,9 @@ endif ifeq ($(TARGET_CORE), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET_CORE), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -305,6 +311,10 @@ ifdef SANITY_CHECK CCOMMON_OPT += -DSANITY_CHECK -DREFNAME=$(*F)f$(BU) endif +ifdef MAX_STACK_ALLOC +CCOMMON_OPT += -DMAX_STACK_ALLOC=$(MAX_STACK_ALLOC) +endif + # # Architecture dependent settings # @@ -354,6 +364,12 @@ endif ifeq ($(USE_OPENMP), 1) + +#check +ifeq ($(USE_THREAD), 0) +$(error OpenBLAS: Cannot set both USE_OPENMP=1 and USE_THREAD=0. The USE_THREAD=0 is only for building single thread version.) +endif + # ifeq logical or. 
GCC or LSB ifeq ($(C_COMPILER), $(filter $(C_COMPILER),GCC LSB)) CCOMMON_OPT += -fopenmp @@ -392,7 +408,7 @@ endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL diff --git a/README.md b/README.md index f4c547701..cdacf9888 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. +- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. diff --git a/TargetList.txt b/TargetList.txt index 97661fdcf..1c985080b 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -32,6 +32,7 @@ ISTANBUL BOBCAT BULLDOZER PILEDRIVER +STEAMROLLER c)VIA CPU: SSE_GENERIC @@ -62,6 +63,11 @@ SPARC SPARCV7 6.ARM CPU: +CORTEXA15 +CORTEXA9 ARMV7 ARMV6 ARMV5 + +7.ARM 64-bit CPU: +ARMV8 diff --git a/benchmark/Makefile b/benchmark/Makefile index cf219cef1..b5eaa9343 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -6,8 +6,13 @@ include $(TOPDIR)/Makefile.system #LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm # ACML custom -ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib -LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm +#ACML=/opt/pb/acml-5-3-1-gfortran-64bit/gfortran64_fma4_mp/lib +#LIBACML = -fopenmp $(ACML)/libacml_mp.a -lgfortran -lm + +# ACML 6.1 custom +ACML=/home/werner/project/acml6.1/gfortran64_mp/lib +LIBACML = -fopenmp $(ACML)/libacml_mp.so -lgfortran -lm + # Atlas Ubuntu #ATLAS=/usr/lib/atlas-base diff --git a/benchmark/axpy.c b/benchmark/axpy.c index ef3b5ae4f..a7206b690 
100644 --- a/benchmark/axpy.c +++ b/benchmark/axpy.c @@ -114,7 +114,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT alpha[2] = { 2.0, 2.0 }; @@ -198,4 +198,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 76c368eda..c8b96d80f 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -117,7 +117,7 @@ static __inline double getmflops(int ratio, int m, double secs){ } -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ #ifndef COMPLEX char *trans[] = {"T", "N"}; @@ -273,4 +273,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/dot.c b/benchmark/dot.c index 6132ed324..4c8d6cc38 100644 --- a/benchmark/dot.c +++ b/benchmark/dot.c @@ -108,7 +108,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *x, *y; FLOAT result; @@ -192,4 +192,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/geev.c b/benchmark/geev.c index 3b7465360..a2ca2c315 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -139,7 +139,7 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; FLOAT wkopt[4]; @@ -257,4 +257,4 @@ int MAIN__(int argc, char *argv[]){ return 0; } -void main(int argc, char *argv[]) __attribute__((weak, 
alias("MAIN__"))); +// void main(int argc, char *argv[]) __attribute__((weak, alias("MAIN__"))); diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 4f9a58825..5a3587622 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -118,14 +118,15 @@ static void *huge_malloc(BLASLONG size){ #endif -int MAIN__(int argc, char *argv[]){ +int main(int argc, char *argv[]){ FLOAT *a, *b, *c; FLOAT alpha[] = {1.0, 1.0}; FLOAT beta [] = {1.0, 1.0}; char trans='N'; - blasint m, i, j; + blasint m, n, i, j; int loops = 1; + int has_param_n=0; int l; char *p; @@ -162,6 +163,11 @@ int MAIN__(int argc, char *argv[]){ if ( p != NULL ) loops = atoi(p); + if ((p = getenv("OPENBLAS_PARAM_N"))) { + n = atoi(p); + has_param_n=1; + } + #ifdef linux srandom(getpid()); @@ -174,7 +180,14 @@ int MAIN__(int argc, char *argv[]){ timeg=0; - fprintf(stderr, " %6d : ", (int)m); + if ( has_param_n == 1 && n <= m ) + n=n; + else + n=m; + + + + fprintf(stderr, " %6dx%d : ", (int)m, (int)n); for (l=0; l comatcopy_k_rnc #define COMATCOPY_K_CTC gotoblas -> comatcopy_k_ctc #define COMATCOPY_K_RTC gotoblas -> comatcopy_k_rtc +#define CGEADD_K gotoblas -> cgeadd_k #endif diff --git a/common_d.h b/common_d.h index c34e1f28f..d6dfd7f04 100644 --- a/common_d.h +++ b/common_d.h @@ -149,6 +149,7 @@ #define DOMATCOPY_K_RN domatcopy_k_rn #define DOMATCOPY_K_CT domatcopy_k_ct #define DOMATCOPY_K_RT domatcopy_k_rt +#define DGEADD_K dgeadd_k #else @@ -267,6 +268,8 @@ #define DOMATCOPY_K_CT gotoblas -> domatcopy_k_ct #define DOMATCOPY_K_RT gotoblas -> domatcopy_k_rt +#define DGEADD_K gotoblas -> dgeadd_k + #endif #define DGEMM_NN dgemm_nn diff --git a/common_interface.h b/common_interface.h index ddd2cf6e5..15f69e02f 100644 --- a/common_interface.h +++ b/common_interface.h @@ -754,6 +754,12 @@ void BLASFUNC(dimatcopy) (char *, char *, blasint *, blasint *, double *, do void BLASFUNC(cimatcopy) (char *, char *, blasint *, blasint *, float *, float *, blasint *, blasint *); void BLASFUNC(zimatcopy) (char *, char *, 
blasint *, blasint *, double *, double *, blasint *, blasint *); +void BLASFUNC(sgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(dgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); +void BLASFUNC(cgeadd) (blasint *, blasint *, float *, float *, blasint *, float *, float *, blasint*); +void BLASFUNC(zgeadd) (blasint *, blasint *, double *, double *, blasint *, double *, double *, blasint*); + + #ifdef __cplusplus } diff --git a/common_level3.h b/common_level3.h index 0babd45b7..e0ecbc4e2 100644 --- a/common_level3.h +++ b/common_level3.h @@ -1762,6 +1762,11 @@ int zomatcopy_k_rnc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, dou int zomatcopy_k_ctc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); int zomatcopy_k_rtc(BLASLONG, BLASLONG, double, double, double *, BLASLONG, double *, BLASLONG); +int sgeadd_k(BLASLONG, BLASLONG, float, float*, BLASLONG, float, float *, BLASLONG); +int dgeadd_k(BLASLONG, BLASLONG, double, double*, BLASLONG, double, double *, BLASLONG); +int cgeadd_k(BLASLONG, BLASLONG, float, float, float*, BLASLONG, float, float, float *, BLASLONG); +int zgeadd_k(BLASLONG, BLASLONG, double,double, double*, BLASLONG, double, double, double *, BLASLONG); + #ifdef __CUDACC__ } diff --git a/common_macro.h b/common_macro.h index f9de3773a..8555baa67 100644 --- a/common_macro.h +++ b/common_macro.h @@ -634,7 +634,7 @@ #define OMATCOPY_K_RN DOMATCOPY_K_RN #define OMATCOPY_K_CT DOMATCOPY_K_CT #define OMATCOPY_K_RT DOMATCOPY_K_RT - +#define GEADD_K DGEADD_K #else #define AMAX_K SAMAX_K @@ -932,6 +932,7 @@ #define OMATCOPY_K_CT SOMATCOPY_K_CT #define OMATCOPY_K_RT SOMATCOPY_K_RT +#define GEADD_K SGEADD_K #endif #else #ifdef XDOUBLE @@ -1746,6 +1747,7 @@ #define OMATCOPY_K_RNC ZOMATCOPY_K_RNC #define OMATCOPY_K_CTC ZOMATCOPY_K_CTC #define OMATCOPY_K_RTC ZOMATCOPY_K_RTC +#define GEADD_K ZGEADD_K #else @@ -2159,6 +2161,8 @@ #define 
OMATCOPY_K_CTC COMATCOPY_K_CTC #define OMATCOPY_K_RTC COMATCOPY_K_RTC +#define GEADD_K CGEADD_K + #endif #endif diff --git a/common_param.h b/common_param.h index 49c1bf73b..1b56e85f0 100644 --- a/common_param.h +++ b/common_param.h @@ -855,6 +855,10 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); + int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); + int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); + int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); + int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); /* all scalars are double, matching the zgeadd_k prototype in common_level3.h */ } gotoblas_t; diff --git a/common_s.h b/common_s.h index 4e9b6dbe7..a4d8679b7 100644 --- a/common_s.h +++ b/common_s.h @@ -153,6 +153,7 @@ #define SOMATCOPY_K_CT somatcopy_k_ct #define SOMATCOPY_K_RT somatcopy_k_rt +#define SGEADD_K sgeadd_k #else @@ -274,6 +275,7 @@ #define SOMATCOPY_K_CT gotoblas -> somatcopy_k_ct #define SOMATCOPY_K_RT gotoblas -> somatcopy_k_rt +#define SGEADD_K gotoblas -> sgeadd_k #endif diff --git a/common_x86.h b/common_x86.h index f97fd348a..9d82090cc 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 547614f74..e0a6c4c42 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/common_z.h b/common_z.h index 133dea80c..b17122776 100644 --- a/common_z.h +++ b/common_z.h @@ -220,6 +220,7 @@ #define ZOMATCOPY_K_CTC zomatcopy_k_ctc #define ZOMATCOPY_K_RTC zomatcopy_k_rtc +#define ZGEADD_K zgeadd_k #else @@ -403,6 +404,8 @@ #define ZOMATCOPY_K_CTC gotoblas -> zomatcopy_k_ctc #define ZOMATCOPY_K_RTC gotoblas -> zomatcopy_k_rtc +#define ZGEADD_K gotoblas -> zgeadd_k + #endif #define ZGEMM_NN zgemm_nn diff --git a/cpuid.h b/cpuid.h index cb4404cb0..ab6a3fb32 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,10 +104,11 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 -#define CORE_BOBCAT 21 -#define CORE_BULLDOZER 22 +#define CORE_BOBCAT 21 +#define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL 24 +#define CORE_HASWELL 24 +#define CORE_STEAMROLLER 25 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,6 +201,7 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -#define CPUTYPE_HASWELL 48 +#define CPUTYPE_HASWELL 48 +#define CPUTYPE_STEAMROLLER 49 #endif diff --git a/cpuid_arm.c b/cpuid_arm.c index b7181b2f9..51ba72d70 100644 --- a/cpuid_arm.c +++ b/cpuid_arm.c @@ -30,16 +30,27 @@ #define CPU_UNKNOWN 0 #define CPU_ARMV6 1 #define CPU_ARMV7 2 -#define CPU_CORTEXA15 3 +#define CPU_CORTEXA9 3 +#define CPU_CORTEXA15 4 static char *cpuname[] = { - "UNKOWN", + "UNKNOWN", /* entry 0 is printed by get_subarchitecture() when detect() returns CPU_UNKNOWN */ "ARMV6", "ARMV7", + "CORTEXA9", "CORTEXA15" }; +static char *cpuname_lower[] = { + "unknown", + "armv6", + "armv7", + "cortexa9", 
+ "cortexa15" +}; + + int get_feature(char *search) { @@ -85,6 +96,29 @@ int detect(void) char buffer[512], *p; p = (char *) NULL ; + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)) + { + + if (!strncmp("CPU part", buffer, 8)) + { + p = strchr(buffer, ':') + 2; + break; + } + } + + fclose(infile); + if(p != NULL) { + if (strstr(p, "0xc09")) { + return CPU_CORTEXA9; + } + if (strstr(p, "0xc0f")) { + return CPU_CORTEXA15; + } + + } + + p = (char *) NULL ; infile = fopen("/proc/cpuinfo", "r"); while (fgets(buffer, sizeof(buffer), infile)) @@ -142,21 +176,7 @@ void get_architecture(void) void get_subarchitecture(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV7: - printf("ARMV7"); - break; - - case CPU_ARMV6: - printf("ARMV6"); - break; - - default: - printf("UNKNOWN"); - break; - } + printf("%s", cpuname[d]); } void get_subdirname(void) @@ -170,6 +190,36 @@ void get_cpuconfig(void) int d = detect(); switch (d) { + case CPU_CORTEXA9: + printf("#define CORTEXA9\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + if ( get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); + break; + + case CPU_CORTEXA15: + printf("#define CORTEXA15\n"); + printf("#define HAVE_VFP\n"); + printf("#define HAVE_VFPV3\n"); + if ( get_feature("neon")) printf("#define HAVE_NEON\n"); + if ( get_feature("vfpv4")) printf("#define HAVE_VFPV4\n"); + printf("#define L1_DATA_SIZE 32768\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 1048576\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 128\n"); + printf("#define DTB_SIZE 4096\n"); + 
printf("#define L2_ASSOCIATIVE 4\n"); + break; + case CPU_ARMV7: printf("#define ARMV7\n"); @@ -206,18 +256,7 @@ void get_libname(void) { int d = detect(); - switch (d) - { - - case CPU_ARMV7: - printf("armv7\n"); - break; - - case CPU_ARMV6: - printf("armv6\n"); - break; - - } + printf("%s", cpuname_lower[d]); } diff --git a/cpuid_x86.c b/cpuid_x86.c index 44446e582..ef90b26d8 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1162,6 +1162,12 @@ int get_cpuname(void){ return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. + case 0: + if(support_avx()) + return CPUTYPE_STEAMROLLER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. + } break; case 5: @@ -1290,6 +1296,7 @@ static char *cpuname[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *lowercpuname[] = { @@ -1341,6 +1348,7 @@ static char *lowercpuname[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; static char *corename[] = { @@ -1369,6 +1377,7 @@ static char *corename[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *corename_lower[] = { @@ -1397,6 +1406,7 @@ static char *corename_lower[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; @@ -1562,7 +1572,15 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. + + case 0: + if(support_avx()) + return CORE_STEAMROLLER; + else + return CORE_BARCELONA; //OS don't support AVX. } + + }else return CORE_BARCELONA; } } diff --git a/driver/others/Makefile b/driver/others/Makefile index fc73871cc..ed145cee8 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. 
include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_num_threads.$(SUFFIX) openblas_get_num_procs.$(SUFFIX) openblas_get_config.$(SUFFIX) openblas_get_parallel.$(SUFFIX) openblas_error_handle.$(SUFFIX) #COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -103,6 +103,12 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_num_threads.$(SUFFIX) : openblas_get_num_threads.c + $(CC) $(CFLAGS) -c $< -o $(@F) + +openblas_get_num_procs.$(SUFFIX) : openblas_get_num_procs.c + $(CC) $(CFLAGS) -c $< -o $(@F) + openblas_get_config.$(SUFFIX) : openblas_get_config.c $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1235df2db..60b3c72af 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #else @@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA #endif @@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
} + }else if(model == 0){ + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } } + + } else { return &gotoblas_BARCELONA; } @@ -315,6 +327,7 @@ static char *corename[] = { "Bulldozer", "Piledriver", "Haswell", + "Steamroller", }; char *gotoblas_corename(void) { @@ -339,6 +352,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; return corename[0]; } @@ -349,9 +363,9 @@ static gotoblas_t *force_coretype(char *coretype){ int i ; int found = -1; char message[128]; - char mname[20]; + //char mname[20]; - for ( i=1 ; i <= 20; i++) + for ( i=1 ; i <= 21; i++) { if (!strncasecmp(coretype,corename[i],20)) { @@ -361,8 +375,8 @@ static gotoblas_t *force_coretype(char *coretype){ } if (found < 0) { - strncpy(mname,coretype,20); - sprintf(message, "Core not found: %s\n",mname); + //strncpy(mname,coretype,20); + snprintf(message, 128, "Core not found: %s\n",coretype); openblas_warning(1, message); return(NULL); } @@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); case 18: return (&gotoblas_BULLDOZER); diff --git a/driver/others/memory.c b/driver/others/memory.c index 16d68cced..4010ec974 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -241,6 +241,7 @@ void set_stack_limit(int limitMB){ */ #endif + /* OpenBLAS uses the numbers of CPU cores in multithreading. 
It can be set by openblas_set_num_threads(int num_threads); @@ -323,6 +324,23 @@ int blas_get_cpu_number(void){ } #endif + +int openblas_get_num_procs(void) { +#ifndef SMP + return 1; +#else + return get_num_procs(); +#endif +} + +int openblas_get_num_threads(void) { +#ifndef SMP + return 1; +#else + return blas_get_cpu_number(); +#endif +} + struct release_t { void *address; void (*func)(struct release_t *); @@ -1335,6 +1353,8 @@ void DESTRUCTOR gotoblas_quit(void) { if (gotoblas_initialized == 0) return; + blas_shutdown(); + #ifdef PROFILE moncontrol (0); #endif @@ -1356,8 +1376,6 @@ void DESTRUCTOR gotoblas_quit(void) { #ifdef PROFILE moncontrol (1); #endif - - blas_shutdown(); } #if (defined(C_PGI) || (!defined(C_SUN) && defined(F_INTERFACE_SUN))) && (defined(ARCH_X86) || defined(ARCH_X86_64)) diff --git a/driver/others/openblas_get_num_procs.c b/driver/others/openblas_get_num_procs.c new file mode 100644 index 000000000..6b0c1ec5c --- /dev/null +++ b/driver/others/openblas_get_num_procs.c @@ -0,0 +1,40 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +extern int openblas_get_num_procs(void); + +int openblas_get_num_procs_(void) { + return openblas_get_num_procs(); +} diff --git a/driver/others/openblas_get_num_threads.c b/driver/others/openblas_get_num_threads.c new file mode 100644 index 000000000..e31aa4b4a --- /dev/null +++ b/driver/others/openblas_get_num_threads.c @@ -0,0 +1,40 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. 
Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +extern int openblas_get_num_threads(void); + +int openblas_get_num_threads_(void) { + return openblas_get_num_threads(); +} diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f0f889a15..d741f2fb9 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -166,7 +166,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) 
|| defined(NEHALEM) || defined(HASWELL) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) int size = 16; #else int size = get_L2_size(); diff --git a/exports/Makefile b/exports/Makefile index f2f688191..1fdaf2213 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -100,7 +100,12 @@ else $(OBJCONV) @objconv.def ../$(LIBNAME) ../$(LIBNAME).renamed $(LIBDYNNAME) : ../$(LIBNAME).renamed osx.def endif +ifeq ($(NOFORTRAN), 2) +#only build cblas without Fortran + $(CC) $(CFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +else $(FC) $(FFLAGS) -all_load -headerpad_max_install_names -install_name $(CURDIR)/../$(LIBDYNNAME) -dynamiclib -o ../$(LIBDYNNAME) $< -Wl,-exported_symbols_list,osx.def $(FEXTRALIB) +endif dllinit.$(SUFFIX) : dllinit.c $(CC) $(CFLAGS) -c -o $(@F) -s $< diff --git a/exports/gensymbol b/exports/gensymbol index 8bd2f17af..12ca7376c 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -23,7 +23,8 @@ zhpr,zrotg,zscal,zswap,zsymm,zsyr2k,zsyrk,ztbmv, ztbsv,ztpmv,ztpsv,ztrmm,ztrmv,ztrsm,ztrsv, zsymv, xerbla, - saxpby,daxpby,caxpby,zaxpby + saxpby,daxpby,caxpby,zaxpby, + sgeadd,dgeadd,cgeadd,zgeadd, ); @cblasobjs = ( @@ -55,6 +56,7 @@ cblas_saxpby,cblas_daxpby,cblas_caxpby,cblas_zaxpby, cblas_somatcopy, cblas_domatcopy, cblas_comatcopy, cblas_zomatcopy, cblas_simatcopy, cblas_dimatcopy, cblas_cimatcopy, cblas_zimatcopy, + cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd ); @exblasobjs = ( @@ -81,7 +83,10 @@ #both underscore and no underscore @misc_common_objs = ( - openblas_set_num_threads, openblas_get_parallel, + openblas_get_parallel, + openblas_get_num_procs, + openblas_set_num_threads, + openblas_get_num_threads, ); @misc_no_underscore_objs = ( diff --git a/getarch.c b/getarch.c index 81ab9e37c..ee5f55fd1 100644 --- a/getarch.c +++ 
b/getarch.c @@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "PILEDRIVER" #endif +#if defined (FORCE_STEAMROLLER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "STEAMROLLER" +#define ARCHCONFIG "-DSTEAMROLLER " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ + "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" +#define LIBNAME "steamroller" +#define CORENAME "STEAMROLLER" +#endif + + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL @@ -710,6 +727,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #else #endif +#ifdef FORCE_CORTEXA9 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "CORTEXA9" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DCORTEXA9 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa9" +#define CORENAME "CORTEXA9" +#else +#endif + +#ifdef FORCE_CORTEXA15 +#define FORCE +#define ARCHITECTURE "ARM" +#define SUBARCHITECTURE "CORTEXA15" +#define SUBDIRNAME "arm" +#define ARCHCONFIG "-DCORTEXA15 " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \ + "-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \ + "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \ + "-DHAVE_VFPV3 -DHAVE_VFP -DHAVE_NEON" +#define LIBNAME "cortexa15" +#define CORENAME "CORTEXA15" +#else +#endif + #ifdef FORCE_ARMV6 #define FORCE #define ARCHITECTURE "ARM" diff --git a/interface/Makefile b/interface/Makefile index 54699b7e3..1666d9145 100644 --- a/interface/Makefile +++ 
b/interface/Makefile @@ -43,7 +43,8 @@ SBLAS2OBJS = \ SBLAS3OBJS = \ sgemm.$(SUFFIX) ssymm.$(SUFFIX) strmm.$(SUFFIX) \ strsm.$(SUFFIX) ssyrk.$(SUFFIX) ssyr2k.$(SUFFIX) \ - somatcopy.$(SUFFIX) simatcopy.$(SUFFIX) + somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ + sgeadd.$(SUFFIX) DBLAS1OBJS = \ @@ -68,7 +69,8 @@ DBLAS2OBJS = \ DBLAS3OBJS = \ dgemm.$(SUFFIX) dsymm.$(SUFFIX) dtrmm.$(SUFFIX) \ dtrsm.$(SUFFIX) dsyrk.$(SUFFIX) dsyr2k.$(SUFFIX) \ - domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX) + domatcopy.$(SUFFIX) dimatcopy.$(SUFFIX)\ + dgeadd.$(SUFFIX) CBLAS1OBJS = \ caxpy.$(SUFFIX) caxpyc.$(SUFFIX) cswap.$(SUFFIX) \ @@ -96,7 +98,8 @@ CBLAS3OBJS = \ cgemm.$(SUFFIX) csymm.$(SUFFIX) ctrmm.$(SUFFIX) \ ctrsm.$(SUFFIX) csyrk.$(SUFFIX) csyr2k.$(SUFFIX) \ chemm.$(SUFFIX) cherk.$(SUFFIX) cher2k.$(SUFFIX) \ - comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX) + comatcopy.$(SUFFIX) cimatcopy.$(SUFFIX)\ + cgeadd.$(SUFFIX) ZBLAS1OBJS = \ zaxpy.$(SUFFIX) zaxpyc.$(SUFFIX) zswap.$(SUFFIX) \ @@ -124,7 +127,8 @@ ZBLAS3OBJS = \ zgemm.$(SUFFIX) zsymm.$(SUFFIX) ztrmm.$(SUFFIX) \ ztrsm.$(SUFFIX) zsyrk.$(SUFFIX) zsyr2k.$(SUFFIX) \ zhemm.$(SUFFIX) zherk.$(SUFFIX) zher2k.$(SUFFIX) \ - zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX) + zomatcopy.$(SUFFIX) zimatcopy.$(SUFFIX)\ + zgeadd.$(SUFFIX) ifeq ($(SUPPORT_GEMM3M), 1) @@ -269,7 +273,8 @@ CSBLAS2OBJS = \ CSBLAS3OBJS = \ cblas_sgemm.$(SUFFIX) cblas_ssymm.$(SUFFIX) cblas_strmm.$(SUFFIX) cblas_strsm.$(SUFFIX) \ - cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX) + cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ + cblas_sgeadd.$(SUFFIX) CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -285,7 +290,8 @@ CDBLAS2OBJS = \ CDBLAS3OBJS += \ cblas_dgemm.$(SUFFIX) cblas_dsymm.$(SUFFIX) cblas_dtrmm.$(SUFFIX) cblas_dtrsm.$(SUFFIX) \ - cblas_dsyrk.$(SUFFIX) cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) + cblas_dsyrk.$(SUFFIX) 
cblas_dsyr2k.$(SUFFIX) cblas_domatcopy.$(SUFFIX) cblas_dimatcopy.$(SUFFIX) \ + cblas_dgeadd.$(SUFFIX) CCBLAS1OBJS = \ cblas_icamax.$(SUFFIX) cblas_scasum.$(SUFFIX) cblas_caxpy.$(SUFFIX) \ @@ -308,7 +314,9 @@ CCBLAS3OBJS = \ cblas_cgemm.$(SUFFIX) cblas_csymm.$(SUFFIX) cblas_ctrmm.$(SUFFIX) cblas_ctrsm.$(SUFFIX) \ cblas_csyrk.$(SUFFIX) cblas_csyr2k.$(SUFFIX) \ cblas_chemm.$(SUFFIX) cblas_cherk.$(SUFFIX) cblas_cher2k.$(SUFFIX) \ - cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX) + cblas_comatcopy.$(SUFFIX) cblas_cimatcopy.$(SUFFIX)\ + cblas_cgeadd.$(SUFFIX) + CZBLAS1OBJS = \ @@ -332,7 +340,9 @@ CZBLAS3OBJS = \ cblas_zgemm.$(SUFFIX) cblas_zsymm.$(SUFFIX) cblas_ztrmm.$(SUFFIX) cblas_ztrsm.$(SUFFIX) \ cblas_zsyrk.$(SUFFIX) cblas_zsyr2k.$(SUFFIX) \ cblas_zhemm.$(SUFFIX) cblas_zherk.$(SUFFIX) cblas_zher2k.$(SUFFIX)\ - cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) + cblas_zomatcopy.$(SUFFIX) cblas_zimatcopy.$(SUFFIX) \ + cblas_zgeadd.$(SUFFIX) + ifeq ($(SUPPORT_GEMM3M), 1) @@ -2103,4 +2113,27 @@ zimatcopy.$(SUFFIX) zimatcopy.$(PSUFFIX) : zimatcopy.c cblas_zimatcopy.$(SUFFIX) cblas_zimatcopy.$(PSUFFIX) : zimatcopy.c $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) +sgeadd.$(SUFFIX) sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +dgeadd.$(SUFFIX) dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cgeadd.$(SUFFIX) cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +zgeadd.$(SUFFIX) zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) $< -o $(@F) + +cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_dgeadd.$(SUFFIX) cblas_dgeadd.$(PSUFFIX) : geadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_cgeadd.$(SUFFIX) cblas_cgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) + +cblas_zgeadd.$(SUFFIX) cblas_zgeadd.$(PSUFFIX) : zgeadd.c + $(CC) -c $(CFLAGS) -DCBLAS $< -o $(@F) diff --git a/interface/geadd.c b/interface/geadd.c new file mode 100644 index 
000000000..f0befa14a --- /dev/null +++ b/interface/geadd.c @@ -0,0 +1,148 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE) +#define ERROR_NAME "DGEADD " +#else +#define ERROR_NAME "SGEADD " +#endif + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *BETA, FLOAT *c, blasint *LDC) +{ /* Fortran-style entry: every argument arrives by reference; M, N, LDA and LDC are validated before GEADD_K is called */ + + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint ldc = *LDC; + FLOAT alpha = *ALPHA; + FLOAT beta = *BETA; + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + + if (ldc < MAX(1, m)) info = 8; /* tests run from the highest argument index down so the lowest bad index is the one reported */ + if (lda < MAX(1, m)) info = 5; /* LDA is the 5th argument, matching the CBLAS branch below */ + + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else +void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT alpha, FLOAT *a, blasint lda, FLOAT beta, + FLOAT *c, blasint ldc) +{ /* CBLAS entry: for row-major input m and n are swapped before validation and before the kernel call */ +/* +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT alpha, + FLOAT *a, blasint lda, + FLOAT beta, + FLOAT *c, blasint ldc){ */ + + blasint info, t; + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + + info = -1; + + if (ldc < MAX(1, m)) info = 8; + if (lda < MAX(1, m)) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + if (ldc < MAX(1, m)) info = 8; + if (lda < MAX(1, m)) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { /* -1 means all checks passed; 0 is left over when order matched neither layout */ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + + GEADD_K(m,n,alpha, a, lda, beta, c, ldc); + + + FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/gemv.c b/interface/gemv.c index 2dd82dce5..f33973ef3 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -208,7 +208,20 @@ void CNAME(enum 
CBLAS_ORDER order, if (incx < 0) x -= (lenx - 1) * incx; if (incy < 0) y -= (leny - 1) * incy; +#ifdef MAX_STACK_ALLOC + // make it volatile because some gemv implementation (ex: dgemv_n.S) + // do not restore all register + volatile int stack_alloc_size = m + n; + if(stack_alloc_size < 128) + //dgemv_n.S require a 128 bytes buffer + stack_alloc_size = 128; + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + stack_alloc_size = 0; + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); +#else buffer = (FLOAT *)blas_memory_alloc(1); +#endif #ifdef SMP @@ -237,7 +250,10 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); +#ifdef MAX_STACK_ALLOC + if(!stack_alloc_size) +#endif + blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); diff --git a/interface/ger.c b/interface/ger.c index 9857d2423..9dd2dc58b 100644 --- a/interface/ger.c +++ b/interface/ger.c @@ -171,7 +171,15 @@ void CNAME(enum CBLAS_ORDER order, if (incy < 0) y -= (n - 1) * incy; if (incx < 0) x -= (m - 1) * incx; +#ifdef MAX_STACK_ALLOC + volatile int stack_alloc_size = m; + if(stack_alloc_size > MAX_STACK_ALLOC / sizeof(FLOAT)) + stack_alloc_size = 0; + FLOAT stack_buffer[stack_alloc_size]; + buffer = stack_alloc_size ? stack_buffer : (FLOAT *)blas_memory_alloc(1); +#else buffer = (FLOAT *)blas_memory_alloc(1); +#endif #ifdef SMPTEST nthreads = num_cpu_avail(2); @@ -190,7 +198,10 @@ void CNAME(enum CBLAS_ORDER order, } #endif - blas_memory_free(buffer); +#ifdef MAX_STACK_ALLOC + if(!stack_alloc_size) +#endif + blas_memory_free(buffer); FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); diff --git a/interface/zgeadd.c b/interface/zgeadd.c new file mode 100644 index 000000000..7124cf230 --- /dev/null +++ b/interface/zgeadd.c @@ -0,0 +1,146 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. 
*/ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#if defined(DOUBLE) +#define ERROR_NAME "ZGEADD " +#else +#define ERROR_NAME "CGEADD " +#endif + +#ifndef CBLAS + +void NAME(blasint *M, blasint *N, FLOAT *ALPHA, FLOAT *a, blasint *LDA, + FLOAT *BETA, FLOAT *c, blasint *LDC) +{ /* Fortran-style entry: every argument arrives by reference; ALPHA and BETA are read as two consecutive FLOATs each when passed to GEADD_K */ + + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint ldc = *LDC; + + blasint info; + + PRINT_DEBUG_NAME; + + info = 0; + + + if (ldc < MAX(1, m)) info = 8; /* tests run from the highest argument index down so the lowest bad index is the one reported */ + if (lda < MAX(1, m)) info = 5; /* LDA is the 5th argument, matching the CBLAS branch below */ + + if (n < 0) info = 2; + if (m < 0) info = 1; + + if (info != 0){ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else +void CNAME( enum CBLAS_ORDER order, blasint m, blasint n, FLOAT *ALPHA, FLOAT *a, blasint lda, FLOAT *BETA, + FLOAT *c, blasint ldc) +{ /* CBLAS entry: for row-major input m and n are swapped before validation and before the kernel call */ +/* +void CNAME(enum CBLAS_ORDER order, + blasint m, blasint n, + FLOAT *ALPHA, + FLOAT *a, blasint lda, + FLOAT *BETA, + FLOAT *c, blasint ldc){ */ + + blasint info, t; + + PRINT_DEBUG_CNAME; + + info = 0; + + if (order == CblasColMajor) { + + info = -1; + + if (ldc < MAX(1, m)) info = 8; + if (lda < MAX(1, m)) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + + } + + if (order == CblasRowMajor) { + info = -1; + + t = n; + n = m; + m = t; + + if (ldc < MAX(1, m)) info = 8; + if (lda < MAX(1, m)) info = 5; + if (n < 0) info = 2; + if (m < 0) info = 1; + } + + if (info >= 0) { /* -1 means all checks passed; 0 is left over when order matched neither layout */ + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + + IDEBUG_START; + + FUNCTION_PROFILE_START(); + + + GEADD_K(m,n,ALPHA[0],ALPHA[1], a, lda, BETA[0], BETA[1], c, ldc); + + + FUNCTION_PROFILE_END(1, 2* m * n , 2 * m * n); + + IDEBUG_END; + + return; + +} diff --git a/interface/zsyr.c b/interface/zsyr.c index 5d62e8797..5fe29cefa 100644 --- a/interface/zsyr.c +++ b/interface/zsyr.c @@ -173,7 +173,7 @@ void CNAME(enum 
CBLAS_UPLO Uplo, int n, FLOAT alpha, FLO FUNCTION_PROFILE_START(); - if (incx < 0 ) x -= (n - 1) * incx; + if (incx < 0 ) x -= (n - 1) * incx * 2; buffer = (FLOAT *)blas_memory_alloc(1); diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 5702b7ac8..fdbae2daa 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -329,23 +329,27 @@ endif ###### BLAS extensions ##### SBLASOBJS += \ somatcopy_k_cn$(TSUFFIX).$(SUFFIX) somatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) + somatcopy_k_ct$(TSUFFIX).$(SUFFIX) somatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + sgeadd_k$(TSUFFIX).$(SUFFIX) DBLASOBJS += \ domatcopy_k_cn$(TSUFFIX).$(SUFFIX) domatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ - domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) + domatcopy_k_ct$(TSUFFIX).$(SUFFIX) domatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ + dgeadd_k$(TSUFFIX).$(SUFFIX) CBLASOBJS += \ comatcopy_k_cn$(TSUFFIX).$(SUFFIX) comatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ comatcopy_k_ct$(TSUFFIX).$(SUFFIX) comatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ comatcopy_k_cnc$(TSUFFIX).$(SUFFIX) comatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + comatcopy_k_ctc$(TSUFFIX).$(SUFFIX) comatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + cgeadd_k$(TSUFFIX).$(SUFFIX) ZBLASOBJS += \ zomatcopy_k_cn$(TSUFFIX).$(SUFFIX) zomatcopy_k_rn$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_ct$(TSUFFIX).$(SUFFIX) zomatcopy_k_rt$(TSUFFIX).$(SUFFIX) \ zomatcopy_k_cnc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rnc$(TSUFFIX).$(SUFFIX) \ - zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) + zomatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ + zgeadd_k$(TSUFFIX).$(SUFFIX) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -3440,3 +3444,31 @@ $(KDIR)zomatcopy_k_rtc$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZOMATCOPY_RTC) $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -DROWM -DCONJ $< -o $@ +ifndef SGEADD_K +SGEADD_K = 
../generic/geadd.c +endif + +$(KDIR)sgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef DGEADD_K +DGEADD_K = ../generic/geadd.c +endif + +$(KDIR)dgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -UCOMPLEX -UROWM $< -o $@ + +ifndef CGEADD_K +CGEADD_K = ../generic/zgeadd.c +endif + +$(KDIR)cgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEADD_K) + $(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -UROWM $< -o $@ + +ifndef ZGEADD_K +ZGEADD_K = ../generic/zgeadd.c +endif + +$(KDIR)zgeadd_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEADD_K) + $(CC) $(CFLAGS) -c -DDOUBLE -DCOMPLEX -UROWM $< -o $@ + diff --git a/kernel/arm/KERNEL.CORTEXA15 b/kernel/arm/KERNEL.CORTEXA15 new file mode 100644 index 000000000..72e3ba02e --- /dev/null +++ b/kernel/arm/KERNEL.CORTEXA15 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV7 \ No newline at end of file diff --git a/kernel/arm/KERNEL.CORTEXA9 b/kernel/arm/KERNEL.CORTEXA9 new file mode 100644 index 000000000..72e3ba02e --- /dev/null +++ b/kernel/arm/KERNEL.CORTEXA9 @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.ARMV7 \ No newline at end of file diff --git a/kernel/generic/geadd.c b/kernel/generic/geadd.c new file mode 100644 index 000000000..062918b8c --- /dev/null +++ b/kernel/generic/geadd.c @@ -0,0 +1,64 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. 
Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" + + +int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT beta, FLOAT *b, BLASLONG ldb) +{ + BLASLONG i; + FLOAT *aptr,*bptr; + + if ( rows <= 0 ) return(0); + if ( cols <= 0 ) return(0); + + + aptr = a; + bptr = b; + + if ( alpha == 0.0 ) + { + for ( i=0; i> 2 ; diff --git a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c index 40e318de3..6e822fba3 100644 --- a/kernel/x86_64/sgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/sgemv_t_microk_bulldozer-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" @@ -51,10 +51,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 
\n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x @@ -70,13 +70,13 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x "prefetcht0 384(%4,%0,4) \n\t" @@ -107,9 +107,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_haswell-4.c b/kernel/x86_64/sgemv_t_microk_haswell-4.c index 016cb35e7..14fe1ecad 100644 --- a/kernel/x86_64/sgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/sgemv_t_microk_haswell-4.c @@ -42,7 +42,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -54,10 +54,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -69,14 +69,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $8 , %1 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je .L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), 
%%ymm13 \n\t" // 8 * x @@ -96,9 +96,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $16, %0 \n\t" "subq $16, %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_nehalem-4.c b/kernel/x86_64/sgemv_t_microk_nehalem-4.c index 4a167900e..4f07d9640 100644 --- a/kernel/x86_64/sgemv_t_microk_nehalem-4.c +++ b/kernel/x86_64/sgemv_t_microk_nehalem-4.c @@ -41,7 +41,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "xorps %%xmm7 , %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 @@ -60,7 +60,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addps %%xmm10, %%xmm6 \n\t" "addps %%xmm11, %%xmm7 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm4, %%xmm4 \n\t" "haddps %%xmm5, %%xmm5 \n\t" diff --git a/kernel/x86_64/sgemv_t_microk_sandy-4.c b/kernel/x86_64/sgemv_t_microk_sandy-4.c index 6550518f7..76868ab14 100644 --- a/kernel/x86_64/sgemv_t_microk_sandy-4.c +++ b/kernel/x86_64/sgemv_t_microk_sandy-4.c @@ -46,7 +46,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" "testq $0x04, %1 \n\t" - "jz .L08LABEL%= \n\t" + "jz 2f \n\t" "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x @@ -61,10 +61,10 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $4 , %1 \n\t" "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" - ".L08LABEL%=: \n\t" + "2: \n\t" "testq $0x08, %1 \n\t" - "jz .L16LABEL%= \n\t" + "jz 3f \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x @@ -79,14 +79,14 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $8 , %1 \n\t" "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" - ".L16LABEL%=: \n\t" + "3: \n\t" "cmpq $0, %1 \n\t" - "je 
.L16END%= \n\t" + "je 4f \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 384(%2,%0,4) \n\t" "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x @@ -114,9 +114,9 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "subq $16, %1 \n\t" "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" - ".L16END%=: \n\t" + "4: \n\t" "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" diff --git a/kernel/x86_64/ssymv_L.c b/kernel/x86_64/ssymv_L.c index 096adc6ca..a2b716b58 100644 --- a/kernel/x86_64/ssymv_L.c +++ b/kernel/x86_64/ssymv_L.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ssymv_L_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_L_microk_nehalem-2.c" diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c index c9206c1be..9002228f3 100644 --- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c @@ -44,7 +44,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vbroadcastss 12(%8), %%xmm7 \n\t" // temp1[3] ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%4,%0,4), %%xmm12 \n\t" // 2 * a "vmovups (%2,%0,4), %%xmm8 \n\t" // 2 * x @@ -71,7 +71,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL "vmovups %%xmm9 , -16(%3,%0,4) \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovss (%9), %%xmm4 \n\t" "vmovss 4(%9), %%xmm5 \n\t" diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c index a1c62caf6..fb5337946 100644 --- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c @@ -48,7 +48,7 @@ static void 
ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "shufps $0, %%xmm7, %%xmm7 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -86,7 +86,7 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F "addq $4 , %0 \n\t" "cmpq %0 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "movss (%9), %%xmm4 \n\t" // temp1[0] "movss 4(%9), %%xmm5 \n\t" // temp1[1] diff --git a/kernel/x86_64/ssymv_U.c b/kernel/x86_64/ssymv_U.c index 61127aa3d..0aadd3fd2 100644 --- a/kernel/x86_64/ssymv_U.c +++ b/kernel/x86_64/ssymv_U.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ssymv_U_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ssymv_U_microk_nehalem-2.c" diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c index b8b3b73e9..8c01ab806 100644 --- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c @@ -47,7 +47,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovups (%2,%0,4), %%xmm8 \n\t" // 4 * x "vmovups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -73,7 +73,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vhaddps %%xmm0, %%xmm0, %%xmm0 \n\t" "vhaddps %%xmm1, %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c index 9505a395a..2fb8f4494 100644 --- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c +++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c @@ -51,7 +51,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, 
FLOAT *a2, FLOAT "xorq %0,%0 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "movups (%2,%0,4), %%xmm8 \n\t" // 4 * x "movups (%3,%0,4), %%xmm9 \n\t" // 4 * y @@ -89,7 +89,7 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT "addq $4 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "haddps %%xmm0, %%xmm0 \n\t" "haddps %%xmm1, %%xmm1 \n\t" diff --git a/kernel/x86_64/zaxpy.c b/kernel/x86_64/zaxpy.c index ca2f03dd0..52a25c793 100644 --- a/kernel/x86_64/zaxpy.c +++ b/kernel/x86_64/zaxpy.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zaxpy_microk_bulldozer-2.c" #endif diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c index 780109b69..f9732cd4e 100644 --- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c +++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c @@ -40,7 +40,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "vmovddup 8(%4), %%xmm1 \n\t" // imag part of alpha ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 768(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%xmm5 \n\t" // 1 complex values from x @@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" : : diff --git a/kernel/x86_64/zgemv_n_microk_haswell-4.c b/kernel/x86_64/zgemv_n_microk_haswell-4.c index 61358508a..b38cc5763 100644 --- a/kernel/x86_64/zgemv_n_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_n_microk_haswell-4.c @@ -48,7 +48,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex 
values form a0 @@ -111,7 +111,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -153,7 +153,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -199,7 +199,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -237,7 +237,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 "vmovups 32(%4,%0,8), %%ymm9 \n\t" // 2 complex values form a0 @@ -273,7 +273,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -339,7 +339,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -375,7 +375,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_n_microk_sandy-4.c b/kernel/x86_64/zgemv_n_microk_sandy-4.c index 009e4801e..82fc543de 100644 --- a/kernel/x86_64/zgemv_n_microk_sandy-4.c +++ b/kernel/x86_64/zgemv_n_microk_sandy-4.c @@ -48,7 +48,7 
@@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" //"prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -165,7 +165,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "vbroadcastsd 24(%2), %%ymm3 \n\t" // imag part x1 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -216,7 +216,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -254,7 +254,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "vbroadcastsd 8(%2), %%ymm1 \n\t" // imag part x0 ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 256(%4,%0,8) \n\t" "vmovups (%4,%0,8), %%ymm8 \n\t" // 2 complex values form a0 @@ -291,7 +291,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : @@ -356,7 +356,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "vbroadcastsd (%5), %%ymm1 \n\t" // alpha_i ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" // "prefetcht0 192(%2,%0,8) \n\t" "vmovups (%2,%0,8), %%ymm8 \n\t" // 2 complex values from src "vmovups 32(%2,%0,8), %%ymm9 \n\t" @@ -392,7 +392,7 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest,FLOAT a "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vzeroupper \n\t" : diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 
84cf4e2e8..4abb2d5ad 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-4.c" diff --git a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c index 006db226b..792c7e952 100644 --- a/kernel/x86_64/zgemv_t_microk_bulldozer-4.c +++ b/kernel/x86_64/zgemv_t_microk_bulldozer-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm15, %%xmm15, %%xmm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -123,7 +123,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -236,7 +236,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%xmm11, %%xmm11, %%xmm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -286,7 +286,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -369,7 +369,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorpd %%xmm9 , %%xmm9 , %%xmm9 \n\t" // temp ".align 16 \n\t" - 
".L01LOOP%=: \n\t" + "1: \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 "vmovddup 8(%2,%0,8), %%xmm1 \n\t" // imag value from x0 @@ -404,7 +404,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vfmaddpd %%xmm8 , %%xmm5 , %%xmm2, %%xmm8 \n\t" // ar0*xr0,al0*xr0 "vfmaddpd %%xmm9 , %%xmm5 , %%xmm3, %%xmm9 \n\t" // ar0*xl0,al0*xl0 - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha diff --git a/kernel/x86_64/zgemv_t_microk_haswell-4.c b/kernel/x86_64/zgemv_t_microk_haswell-4.c index c87b5ce0f..8a851a54c 100644 --- a/kernel/x86_64/zgemv_t_microk_haswell-4.c +++ b/kernel/x86_64/zgemv_t_microk_haswell-4.c @@ -47,7 +47,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm15, %%ymm15, %%ymm15 \n\t" ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -96,7 +96,7 @@ static void zgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%8) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%8) , %%xmm1 \n\t" // value from alpha @@ -220,7 +220,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "vxorpd %%ymm11, %%ymm11, %%ymm11 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -255,7 +255,7 @@ static void zgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%6) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%6) , %%xmm1 \n\t" // value from alpha @@ -342,7 +342,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "vxorpd %%ymm9 , %%ymm9 , 
%%ymm9 \n\t" // temp ".align 16 \n\t" - ".L01LOOP%=: \n\t" + "1: \n\t" "prefetcht0 192(%2,%0,8) \n\t" "vmovddup (%2,%0,8), %%xmm0 \n\t" // real value from x0 @@ -370,7 +370,7 @@ static void zgemv_kernel_4x1( BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT * "addq $8 , %0 \n\t" "subq $4 , %1 \n\t" - "jnz .L01LOOP%= \n\t" + "jnz 1b \n\t" "vmovddup (%5) , %%xmm0 \n\t" // value from alpha "vmovddup 8(%5) , %%xmm1 \n\t" // value from alpha diff --git a/lapack-netlib/BLAS/TESTING/cblat2.f b/lapack-netlib/BLAS/TESTING/cblat2.f index 5833ea81a..2a6edd382 100644 --- a/lapack-netlib/BLAS/TESTING/cblat2.f +++ b/lapack-netlib/BLAS/TESTING/cblat2.f @@ -120,7 +120,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/cblat3.f b/lapack-netlib/BLAS/TESTING/cblat3.f index 09f2cb9c5..fb2aa4ece 100644 --- a/lapack-netlib/BLAS/TESTING/cblat3.f +++ b/lapack-netlib/BLAS/TESTING/cblat3.f @@ -102,7 +102,7 @@ REAL RZERO PARAMETER ( RZERO = 0.0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. 
diff --git a/lapack-netlib/BLAS/TESTING/dblat2.f b/lapack-netlib/BLAS/TESTING/dblat2.f index 0fa80afa4..80623b260 100644 --- a/lapack-netlib/BLAS/TESTING/dblat2.f +++ b/lapack-netlib/BLAS/TESTING/dblat2.f @@ -117,7 +117,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/dblat3.f b/lapack-netlib/BLAS/TESTING/dblat3.f index 8d37c7453..72c17ed3b 100644 --- a/lapack-netlib/BLAS/TESTING/dblat3.f +++ b/lapack-netlib/BLAS/TESTING/dblat3.f @@ -97,7 +97,7 @@ DOUBLE PRECISION ZERO, ONE PARAMETER ( ZERO = 0.0D0, ONE = 1.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/BLAS/TESTING/sblat2.f b/lapack-netlib/BLAS/TESTING/sblat2.f index 71605ed31..601add7e9 100644 --- a/lapack-netlib/BLAS/TESTING/sblat2.f +++ b/lapack-netlib/BLAS/TESTING/sblat2.f @@ -117,7 +117,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/sblat3.f b/lapack-netlib/BLAS/TESTING/sblat3.f index 879269633..78d809379 100644 --- a/lapack-netlib/BLAS/TESTING/sblat3.f +++ b/lapack-netlib/BLAS/TESTING/sblat3.f @@ -97,7 +97,7 @@ REAL ZERO, ONE PARAMETER ( ZERO = 0.0, ONE = 1.0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. 
diff --git a/lapack-netlib/BLAS/TESTING/zblat2.f b/lapack-netlib/BLAS/TESTING/zblat2.f index 53129a11e..2e3e08e7c 100644 --- a/lapack-netlib/BLAS/TESTING/zblat2.f +++ b/lapack-netlib/BLAS/TESTING/zblat2.f @@ -121,7 +121,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX, INCMAX - PARAMETER ( NMAX = 65, INCMAX = 2 ) + PARAMETER ( NMAX = 128, INCMAX = 2 ) INTEGER NINMAX, NIDMAX, NKBMAX, NALMAX, NBEMAX PARAMETER ( NINMAX = 7, NIDMAX = 9, NKBMAX = 7, $ NALMAX = 7, NBEMAX = 7 ) diff --git a/lapack-netlib/BLAS/TESTING/zblat3.f b/lapack-netlib/BLAS/TESTING/zblat3.f index 59ca24145..39ce06b99 100644 --- a/lapack-netlib/BLAS/TESTING/zblat3.f +++ b/lapack-netlib/BLAS/TESTING/zblat3.f @@ -104,7 +104,7 @@ DOUBLE PRECISION RZERO PARAMETER ( RZERO = 0.0D0 ) INTEGER NMAX - PARAMETER ( NMAX = 65 ) + PARAMETER ( NMAX = 128 ) INTEGER NIDMAX, NALMAX, NBEMAX PARAMETER ( NIDMAX = 9, NALMAX = 7, NBEMAX = 7 ) * .. Local Scalars .. diff --git a/lapack-netlib/TESTING/sep.in b/lapack-netlib/TESTING/sep.in index e0ed58512..19bd7c3da 100644 --- a/lapack-netlib/TESTING/sep.in +++ b/lapack-netlib/TESTING/sep.in @@ -1,11 +1,11 @@ SEP: Data file for testing Symmetric Eigenvalue Problem routines -6 Number of values of N -0 1 2 3 5 20 Values of N (dimension) +8 Number of values of N +0 1 2 3 5 19 20 21 Values of N (dimension) 5 Number of values of NB 1 3 3 3 10 Values of NB (blocksize) 2 2 2 2 2 Values of NBMIN (minimum blocksize) 1 0 5 9 1 Values of NX (crossover point) -60.0 Threshold value +160.0 Threshold value T Put T to test the LAPACK routines T Put T to test the driver routines T Put T to test the error exits diff --git a/lapack-netlib/lapacke/include/lapacke.h b/lapack-netlib/lapacke/include/lapacke.h index a31c10d6d..e506319c2 100644 --- a/lapack-netlib/lapacke/include/lapacke.h +++ b/lapack-netlib/lapacke/include/lapacke.h @@ -10707,9 +10707,9 @@ lapack_int LAPACKE_zsyr_work( int matrix_order, char uplo, lapack_int n, const lapack_complex_double* x, lapack_int incx, 
lapack_complex_double* a, lapack_int lda ); -void LAPACKE_ilaver( const lapack_int* vers_major, - const lapack_int* vers_minor, - const lapack_int* vers_patch ); +void LAPACKE_ilaver( lapack_int* vers_major, + lapack_int* vers_minor, + lapack_int* vers_patch ); #define LAPACK_sgetrf LAPACK_GLOBAL(sgetrf,SGETRF) @@ -16435,8 +16435,8 @@ void LAPACK_csyr( char* uplo, lapack_int* n, lapack_complex_float* alpha, void LAPACK_zsyr( char* uplo, lapack_int* n, lapack_complex_double* alpha, const lapack_complex_double* x, lapack_int* incx, lapack_complex_double* a, lapack_int* lda ); -void LAPACK_ilaver( const lapack_int* vers_major, const lapack_int* vers_minor, - const lapack_int* vers_patch ); +void LAPACK_ilaver( lapack_int* vers_major, lapack_int* vers_minor, + lapack_int* vers_patch ); #ifdef __cplusplus } diff --git a/lapack-netlib/lapacke/include/lapacke_config.h b/lapack-netlib/lapacke/include/lapacke_config.h index 561b2736b..d46ed98e5 100644 --- a/lapack-netlib/lapacke/include/lapacke_config.h +++ b/lapack-netlib/lapacke/include/lapacke_config.h @@ -38,7 +38,6 @@ #if defined(LAPACK_COMPLEX_CPP) #include #endif -extern "C" { #endif /* __cplusplus */ #include @@ -63,8 +62,14 @@ extern "C" { #if defined(LAPACK_COMPLEX_STRUCTURE) +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ typedef struct { float real, imag; } _lapack_complex_float; typedef struct { double real, imag; } _lapack_complex_double; +#ifdef __cplusplus +} +#endif /* __cplusplus */ #define lapack_complex_float _lapack_complex_float #define lapack_complex_double _lapack_complex_double #define lapack_complex_float_real(z) ((z).real) @@ -103,8 +108,14 @@ typedef struct { double real, imag; } _lapack_complex_double; #endif +#ifdef __cplusplus +extern "C" { +#endif /* __cplusplus */ lapack_complex_float lapack_make_complex_float( float re, float im ); lapack_complex_double lapack_make_complex_double( double re, double im ); +#ifdef __cplusplus +} +#endif /* __cplusplus */ #endif @@ -116,8 +127,4 @@ 
lapack_complex_double lapack_make_complex_double( double re, double im ); #define LAPACK_free( p ) free( p ) #endif -#ifdef __cplusplus -} -#endif /* __cplusplus */ - #endif /* _LAPACKE_CONFIG_H_ */ diff --git a/lapack-netlib/lapacke/src/lapacke_ilaver.c b/lapack-netlib/lapacke/src/lapacke_ilaver.c index bec1d900b..ed362e90b 100644 --- a/lapack-netlib/lapacke/src/lapacke_ilaver.c +++ b/lapack-netlib/lapacke/src/lapacke_ilaver.c @@ -26,16 +26,16 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ***************************************************************************** -* Contents: Native high-level C interface to LAPACK function dgesv +* Contents: Native high-level C interface to LAPACK function ilaver * Author: Intel Corporation * Generated November, 2011 *****************************************************************************/ #include "lapacke_utils.h" -void LAPACKE_ilaver( const lapack_int* vers_major, - const lapack_int* vers_minor, - const lapack_int* vers_patch ) +void LAPACKE_ilaver( lapack_int* vers_major, + lapack_int* vers_minor, + lapack_int* vers_patch ) { /* Call LAPACK function */ LAPACK_ilaver( vers_major, vers_minor, vers_patch ); diff --git a/param.h b/param.h index 28ed91e60..18c711eb3 100644 --- a/param.h +++ b/param.h @@ -406,6 +406,99 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef STEAMROLLER +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 +#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 576 +#define ZGEMM_DEFAULT_P 288 +#define CGEMM_DEFAULT_P 576 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 160 +#define ZGEMM_DEFAULT_Q 160 +#define CGEMM_DEFAULT_Q 160 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define 
XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + + #ifdef ATHLON #define SNUMOPT 4 @@ -1129,7 +1222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define DGEMM_DEFAULT_UNROLL_M 8 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 8 -#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 @@ -2113,6 +2206,88 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +#ifdef CORTEXA9 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + +#ifdef CORTEXA15 +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 4 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define 
ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + + + +#define SYMV_P 16 +#endif + + #ifdef GENERIC diff --git a/utest/common_utest.h b/utest/common_utest.h index e8377e681..d170ed27c 100644 --- a/utest/common_utest.h +++ b/utest/common_utest.h @@ -59,6 +59,7 @@ void test_zdotu_n_1(void); void test_zdotu_offset_1(void); void test_drotmg(void); +void test_drotmg_D1eqD2_X1eqX2(); void test_dsdot_n_1(void); diff --git a/utest/main.c b/utest/main.c index f44008b79..770d1451e 100644 --- a/utest/main.c +++ b/utest/main.c @@ -57,6 +57,7 @@ CU_TestInfo test_level1[]={ {"Testing zdotu with input x & y offset == 1",test_zdotu_offset_1}, {"Testing drotmg",test_drotmg}, + {"Testing drotmg with D1 == D2 && X1 == X2",test_drotmg_D1eqD2_X1eqX2}, {"Testing dsdot with n == 1",test_dsdot_n_1}, diff --git a/utest/test_rotmg.c b/utest/test_rotmg.c index b72446c1b..b175653a6 100644 --- a/utest/test_rotmg.c +++ b/utest/test_rotmg.c @@ -65,3 +65,36 @@ void test_drotmg() CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS); } } + +void test_drotmg_D1eqD2_X1eqX2() +{ + double te_d1, tr_d1; + double te_d2, tr_d2; + double te_x1, tr_x1; + double te_y1, tr_y1; + double te_param[5]; + double tr_param[5]; + int i=0; + te_d1= tr_d1=2.; + te_d2= tr_d2=2.; + te_x1= tr_x1=8.; + te_y1= tr_y1=8.; + + for(i=0; i<5; i++){ + te_param[i]=tr_param[i]=0.0; + } + + //OpenBLAS + BLASFUNC(drotmg)(&te_d1, &te_d2, &te_x1, &te_y1, te_param); + //reference + BLASFUNC_REF(drotmg)(&tr_d1, &tr_d2, &tr_x1, &tr_y1, tr_param); + + CU_ASSERT_DOUBLE_EQUAL(te_d1, tr_d1, CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(te_d2, tr_d2, CHECK_EPS); + 
CU_ASSERT_DOUBLE_EQUAL(te_x1, tr_x1, CHECK_EPS); + CU_ASSERT_DOUBLE_EQUAL(te_y1, tr_y1, CHECK_EPS); + + for(i=0; i<5; i++){ + CU_ASSERT_DOUBLE_EQUAL(te_param[i], tr_param[i], CHECK_EPS); + } +}