From e396ec8b56511d84930e849b08d825af62b821a7 Mon Sep 17 00:00:00 2001 From: Martin Kroeker Date: Sun, 11 Oct 2020 15:11:15 +0200 Subject: [PATCH] Allow building support for only a subset of variable types --- CMakeLists.txt | 28 +++++---- Makefile | 15 ++++- Makefile.rule | 32 +++------- Makefile.tail | 4 +- common_param.h | 166 ++++++++++++++++++++++++++++++++++++++++++------- 5 files changed, 182 insertions(+), 63 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 954c053e4..f43e0e0fc 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -29,8 +29,10 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc else() set(NO_AFFINITY 1) endif() -option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF) -option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF) +option(BUILD_SINGLE "Single precision" OFF) +option(BUILD_DOUBLE "Double precision" OFF) +option(BUILD_COMPLEX "Single precision" OFF) +option(BUILD_COMPLEX16 "Single precision" OFF) # Add a prefix or suffix to all exported symbol names in the shared library. # Avoids conflicts with other BLAS libraries, especially when using @@ -108,28 +110,33 @@ endif() set(FLOAT_TYPES "") if (BUILD_SINGLE) - message(STATUS "Building Single Precision") - list(APPEND FLOAT_TYPES "SINGLE") # defines nothing + message(STATUS "Building Songle Precision") + list(APPEND FLOAT_TYPES "SINGLE") + # set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1") endif () if (BUILD_DOUBLE) message(STATUS "Building Double Precision") - list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE + list(APPEND FLOAT_TYPES "DOUBLE") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") endif () if (BUILD_COMPLEX) message(STATUS "Building Complex Precision") - list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX -endif () + list(APPEND FLOAT_TYPES "COMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") +endif () if (BUILD_COMPLEX16) message(STATUS "Building Double Complex Precision") - list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE + list(APPEND FLOAT_TYPES "ZCOMPLEX") + #set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") endif () if (BUILD_HALF) message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") # defines nothing + list(APPEND FLOAT_TYPES "HALF") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF") endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") @@ -236,9 +243,6 @@ if (NOT MSVC AND NOT NOFORTRAN) add_subdirectory(ctest) endif() add_subdirectory(lapack-netlib/TESTING) - if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV) - add_subdirectory(cpp_thread_test) - endif() endif() set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES diff --git a/Makefile b/Makefile index 93e8af2eb..a9af62a22 100644 --- a/Makefile +++ b/Makefile @@ -146,9 +146,6 @@ ifneq ($(NO_CBLAS), 1) ifeq ($(CPP_THREAD_SAFETY_TEST), 1) $(MAKE) -C cpp_thread_test all endif -ifeq ($(CPP_THREAD_SAFETY_GEMV), 1) - $(MAKE) -C cpp_thread_test dgemv_tester -endif endif endif @@ -304,6 +301,18 @@ else endif ifeq ($(BUILD_LAPACK_DEPRECATED), 1) -@echo "BUILD_DEPRECATED = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_SINGLE), 1) + -@echo "BUILD_SINGLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_DOUBLE), 1) + -@echo "BUILD_DOUBLE = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX), 1) + -@echo "BUILD_COMPLEX = 1" >> $(NETLIB_LAPACK_DIR)/make.inc +endif +ifeq ($(BUILD_COMPLEX16), 1) + -@echo "BUILD_COMPLEX16 = 1" >> $(NETLIB_LAPACK_DIR)/make.inc endif -@echo "LAPACKE_WITH_TMG = 1" >> $(NETLIB_LAPACK_DIR)/make.inc -@cat make.inc >> $(NETLIB_LAPACK_DIR)/make.inc diff --git a/Makefile.rule b/Makefile.rule index 635e02c02..09dfb0881 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -272,33 +272,17 @@ COMMON_PROF = -pg # work at all. # # CPP_THREAD_SAFETY_TEST = 1 -# -# use this to run only the less memory-hungry GEMV test -# CPP_THREAD_SAFETY_GEMV = 1 # If you want to enable the experimental BFLOAT16 support # BUILD_HALF = 1 - - -# Set the thread number threshold beyond which the job array for the threaded level3 BLAS -# will be allocated on the heap rather than the stack. (This array alone requires -# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu -# counts, but obviously it is not the only item that ends up on the stack. -# The default value of 32 ensures that the overall requirement is compatible -# with the default 1MB stacksize imposed by having the Java VM loaded without use -# of its -Xss parameter. -# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible -# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java -# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code -# BLAS3_MEM_ALLOC_THRESHOLD = 160 - - - -# the below is not yet configurable, use cmake if you need to build only select types -BUILD_SINGLE = 1 -BUILD_DOUBLE = 1 -BUILD_COMPLEX = 1 -BUILD_COMPLEX16 = 1 +# +# Select if you need to build only select types +# BUILD_SINGLE = 1 +# BUILD_DOUBLE = 1 +# BUILD_COMPLEX = 1 +# BUILD_COMPLEX16 = 1 +# +# # End of user configuration # diff --git a/Makefile.tail b/Makefile.tail index cfc4a36fc..641082450 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -11,8 +11,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) $(CBAUXOBJS) +BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) $(CBAUXOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) diff --git a/common_param.h b/common_param.h index a52de98ab..81b479e53 100644 --- a/common_param.h +++ b/common_param.h @@ -146,40 +146,56 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); #endif + +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int sgemm_p, sgemm_q, sgemm_r; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; +#endif int exclusive_cache; +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smin_k) (BLASLONG, float *, BLASLONG); + BLASLONG (*isamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*ismax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); - float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG); +#endif +#if BUILD_SINGLE float (*ssum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); - double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); + //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if BUILD_SINGLE int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) #ifdef ARCH_X86_64 void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); @@ -193,7 +209,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*sgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -215,7 +232,8 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); - +#endif +#if BUILD_SINGLE int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -242,13 +260,18 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); - +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int dgemm_p, dgemm_q, dgemm_r; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG); @@ -257,25 +280,37 @@ BLASLONG (*idamax_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idamin_k)(BLASLONG, double *, BLASLONG); BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG); +#endif +#if BUILD_DOUBLE double (*dsum_k) (BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); +#endif +#if (BUILD_SINGLE) || (BUILD_DOUBLE) + double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); - int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dswap_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); - int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); +#endif +#if BUILD_DOUBLE int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); - +#endif +#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); @@ -283,7 +318,8 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dgemm_itcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); - +#endif +#if BUILD_DOUBLE int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); @@ -335,7 +371,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); int (*dneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); - +#endif #ifdef EXPRECISION int qgemm_p, qgemm_q, qgemm_r; @@ -430,22 +466,29 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG); #endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int cgemm_p, cgemm_q, cgemm_r; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; - float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG); +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); - +#endif +#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16) int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -459,6 +502,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); @@ -470,13 +515,14 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); @@ -507,6 +553,8 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); +#endif +#if (BUILD_COMPLEX) int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -590,10 +638,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); - +#endif +#if (BUILD_COMPLEX) || (BUILD_COMPLEX16) int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); +#endif +#if BUILD_COMPLEX16 int zgemm_p, zgemm_q, zgemm_r; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; @@ -757,6 +808,7 @@ BLASLONG (*izamin_k)(BLASLONG, double *, BLASLONG); int (*zneg_tcopy) (BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*zlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, double *, BLASLONG, blasint *, double *); +#endif #ifdef EXPRECISION @@ -930,22 +982,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); void (*init)(void); int snum_opt, dnum_opt, qnum_opt; - +#if BUILD_SINGLE int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); @@ -955,7 +1019,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*comatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); @@ -965,17 +1031,23 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zomatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); @@ -985,7 +1057,9 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*cimatcopy_k_ctc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rnc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); @@ -995,12 +1069,20 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG); int (*zimatcopy_k_ctc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rnc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); +#endif +#if BUILD_SINGLE int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); +#endif +#if BUILD_DOUBLE int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); +#endif +#if BUILD_COMPLEX int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); +#endif +#if BUILD_COMPLEX16 int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); - +#endif } gotoblas_t; extern gotoblas_t *gotoblas; @@ -1021,19 +1103,31 @@ extern gotoblas_t *gotoblas; #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #endif +#if (BUILD_SINGLE) #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_R gotoblas -> sgemm_r #define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m #define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#if (BUILD_DOUBLE) #define DGEMM_P gotoblas -> dgemm_p #define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_R gotoblas -> dgemm_r #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#if ! (BUILD_SINGLE) +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R gotoblas -> sgemm_r +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif #define QGEMM_P gotoblas -> qgemm_p #define QGEMM_Q gotoblas -> qgemm_q @@ -1042,19 +1136,47 @@ extern gotoblas_t *gotoblas; #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn +#if BUILD_COMPLEX #define CGEMM_P gotoblas -> cgemm_p #define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_R gotoblas -> cgemm_r #define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m #define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n #define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#ifndef BUILD_SINGLE +#define SGEMM_P gotoblas -> sgemm_p +#define SGEMM_Q gotoblas -> sgemm_q +#define SGEMM_R 1024 +#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m +#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n +#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn +#endif +#endif +#if BUILD_COMPLEX16 #define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_UNROLL_M gotoblas -> zgemm_unroll_m #define ZGEMM_UNROLL_N gotoblas -> zgemm_unroll_n #define ZGEMM_UNROLL_MN gotoblas -> zgemm_unroll_mn +#ifndef BUILD_DOUBLE +#define DGEMM_P gotoblas -> dgemm_p +#define DGEMM_Q gotoblas -> dgemm_q +#define DGEMM_R 1024 +#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m +#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n +#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn +#endif +#ifndef BUILD_COMPLEX +#define CGEMM_P gotoblas -> cgemm_p +#define CGEMM_Q gotoblas -> cgemm_q +#define CGEMM_R gotoblas -> cgemm_r +#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m +#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n +#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn +#endif +#endif #define XGEMM_P gotoblas -> xgemm_p #define XGEMM_Q gotoblas -> xgemm_q @@ -1222,7 +1344,7 @@ extern gotoblas_t *gotoblas; #endif #ifndef COMPLEX -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P QGEMM_P #define GEMM_Q QGEMM_Q #define GEMM_R QGEMM_R @@ -1246,7 +1368,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N -#elif defined(HALF) +#elif (HALF) #define GEMM_P SHGEMM_P #define GEMM_Q SHGEMM_Q #define GEMM_R SHGEMM_R @@ -1272,7 +1394,7 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #endif #else -#if defined(XDOUBLE) +#if (XDOUBLE) #define GEMM_P XGEMM_P #define GEMM_Q XGEMM_Q #define GEMM_R XGEMM_R @@ -1386,7 +1508,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_P #ifdef XDOUBLE #define GEMM3M_P XGEMM3M_P -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_P ZGEMM3M_P #else #define GEMM3M_P CGEMM3M_P @@ -1396,7 +1518,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_Q #ifdef XDOUBLE #define GEMM3M_Q XGEMM3M_Q -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_Q ZGEMM3M_Q #else #define GEMM3M_Q CGEMM3M_Q @@ -1406,7 +1528,7 @@ extern gotoblas_t *gotoblas; #ifndef GEMM3M_R #ifdef XDOUBLE #define GEMM3M_R XGEMM3M_R -#elif defined(DOUBLE) +#elif defined (DOUBLE) #define GEMM3M_R ZGEMM3M_R #else #define GEMM3M_R CGEMM3M_R