diff --git a/CMakeLists.txt b/CMakeLists.txt index c2b9ae7ad..70760d64d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -86,10 +86,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () +if (NOT DEFINED BUILD_HALF) + set (BUILD_HALF false) +endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all - set(BUILD_HALF true) +# set(BUILD_HALF true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -121,7 +124,7 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_SINGLE OR BUILD_HALF) +if (BUILD_HALF) message(STATUS "Building Half Precision") list(APPEND FLOAT_TYPES "HALF") # defines nothing endif () diff --git a/Makefile.rule b/Makefile.rule index 724a60ec4..8549e6394 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -273,6 +273,9 @@ COMMON_PROF = -pg # # CPP_THREAD_SAFETY_TEST = 1 + +# If you want to enable the experimental BFLOAT16 support +# BUILD_HALF = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index ce071133d..76d755ec2 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1124,6 +1124,10 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif +ifeq ($(BUILD_HALF), 1) +CCOMMON_OPT += -DBUILD_HALF +endif + CCOMMON_OPT += -DVERSION=\"$(VERSION)\" ifndef SYMBOLPREFIX @@ -1395,6 +1399,7 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 +export BUILD_HALF export SHGEMM_UNROLL_M export SHGEMM_UNROLL_N diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 1c1fed571..4b505a102 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,6 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) +if (BUILD_HALF) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -131,6 +132,7 @@ macro(SetDefaultL1) set(SHNRM2KERNEL ../arm/nrm2.c) set(SHSUMKERNEL ../arm/sum.c) set(SHSWAPKERNEL ../arm/swap.c) +endif () endmacro () macro(SetDefaultL2) @@ -179,10 +181,11 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) +if (BUILD_HALF) set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) - +endif () endmacro () macro(SetDefaultL3) @@ -190,6 +193,7 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) +if (BUILD_HALF) set(SHGEADD_KERNEL ../generic/geadd.c) set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) set(SHGEMM_BETA ../generic/gemm_beta.c) @@ -201,6 +205,6 @@ macro(SetDefaultL3) set(SHGEMMITCOPYOBJ shgemm_itcopy.o) set(SHGEMMONCOPYOBJ shgemm_oncopy.o) set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) - +endif () endmacro () diff --git a/common_param.h b/common_param.h index 19a34fa3d..c92609a76 100644 --- a/common_param.h +++ b/common_param.h @@ -47,7 +47,7 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#if 1 +#ifdef BUILD_HALF int shgemm_p, shgemm_q, shgemm_r; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; @@ -1002,12 +1002,14 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache +#ifdef BUILD_HALF #define SHGEMM_P gotoblas -> shgemm_p #define SHGEMM_Q gotoblas -> shgemm_q #define SHGEMM_R gotoblas -> shgemm_r #define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m #define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n #define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#endif #define SGEMM_P gotoblas -> sgemm_p #define SGEMM_Q gotoblas -> sgemm_q @@ -1086,6 +1088,7 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif +#ifdef BUILD_HALF #define SHGEMM_P SHGEMM_DEFAULT_P #define SHGEMM_Q SHGEMM_DEFAULT_Q #define SHGEMM_R SHGEMM_DEFAULT_R @@ -1096,6 +1099,7 @@ extern gotoblas_t *gotoblas; #else #define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) #endif +#endif #define SGEMM_P SGEMM_DEFAULT_P #define SGEMM_Q SGEMM_DEFAULT_Q @@ -1330,31 +1334,31 @@ extern gotoblas_t *gotoblas; #endif #ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R -#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15) +#define SGEMM_DEFAULT_R (((BUFFER_SIZE - ((SGEMM_DEFAULT_P * SGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef DGEMM_DEFAULT_R -#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define DGEMM_DEFAULT_R (((BUFFER_SIZE - ((DGEMM_DEFAULT_P * DGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (DGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef QGEMM_DEFAULT_R -#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define QGEMM_DEFAULT_R (((BUFFER_SIZE - ((QGEMM_DEFAULT_P * QGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (QGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef CGEMM_DEFAULT_R -#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15) +#define CGEMM_DEFAULT_R (((BUFFER_SIZE - ((CGEMM_DEFAULT_P * CGEMM_DEFAULT_Q * 8 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (CGEMM_DEFAULT_Q * 8) - 15) & ~15UL) #endif #ifndef ZGEMM_DEFAULT_R -#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15) +#define ZGEMM_DEFAULT_R (((BUFFER_SIZE - ((ZGEMM_DEFAULT_P * ZGEMM_DEFAULT_Q * 16 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (ZGEMM_DEFAULT_Q * 16) - 15) & ~15UL) #endif #ifndef XGEMM_DEFAULT_R -#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15) +#define XGEMM_DEFAULT_R (((BUFFER_SIZE - ((XGEMM_DEFAULT_P * XGEMM_DEFAULT_Q * 32 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (XGEMM_DEFAULT_Q * 32) - 15) & ~15UL) #endif #ifndef SNUMOPT diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 881b4ee35..09a62d9bf 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,7 +19,10 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +endif + SBLASOBJS += \ sgemm_nn.$(SUFFIX) sgemm_nt.$(SUFFIX) sgemm_tn.$(SUFFIX) sgemm_tt.$(SUFFIX) \ strmm_LNUU.$(SUFFIX) strmm_LNUN.$(SUFFIX) strmm_LNLU.$(SUFFIX) strmm_LNLN.$(SUFFIX) \ @@ -204,8 +207,9 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 - +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) QBLASOBJS += qgemm_thread_nn.$(SUFFIX) qgemm_thread_nt.$(SUFFIX) qgemm_thread_tn.$(SUFFIX) qgemm_thread_tt.$(SUFFIX) diff --git a/exports/Makefile b/exports/Makefile index 60291b1ff..c92d6e996 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,6 +30,10 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif +ifndef BUILD_HALF +BUILD_HALF = 0 +endif + ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) ifndef ONLY_CBLAS @@ -234,23 +238,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 235446f14..0a68a3572 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -30,7 +30,7 @@ icamax,icamin,idamax,idamin,idmax,idmin,isamax,isamin,ismax,ismin, izamax,izamin,lsame,samax,samin,sasum,saxpy,scabs1,scamax, scamin,scasum,scnrm2,scopy,sdot,sdsdot,sgbmv,sgemm,sgemv,sger, - shgemm, smax,smin,snrm2, + smax,smin,snrm2, srot,srotg,srotm,srotmg,ssbmv,sscal,sspmv,sspr2,sspr,sswap, ssymm,ssymv,ssyr2,ssyr2k,ssyr,ssyrk,stbmv,stbsv,stpmv,stpsv, strmm,strmv,strsm,strsv,zaxpy,zcopy,zdotc,zdotu,zdrot, @@ -51,6 +51,7 @@ zimatcopy, ); +@halfblasobjs = (shgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -67,7 +68,7 @@ cblas_isamax, cblas_izamax, cblas_sasum, cblas_saxpy, cblas_scasum, cblas_scnrm2, cblas_scopy, cblas_sdot, cblas_sdsdot, cblas_sgbmv, cblas_sgemm, - cblas_sgemv, cblas_sger, cblas_shgemm, cblas_snrm2, cblas_srot, cblas_srotg, + cblas_sgemv, cblas_sger, cblas_snrm2, cblas_srot, cblas_srotg, cblas_srotm, cblas_srotmg, cblas_ssbmv, cblas_sscal, cblas_sspmv, cblas_sspr2, cblas_sspr, cblas_sswap, cblas_ssymm, cblas_ssymv, cblas_ssyr2, cblas_ssyr2k, cblas_ssyr, cblas_ssyrk, cblas_stbmv, cblas_stbsv, cblas_stpmv, cblas_stpsv, cblas_strmm, cblas_strmv, cblas_strsm, @@ -83,6 +84,8 @@ cblas_sgeadd, cblas_dgeadd,cblas_cgeadd, cblas_zgeadd ); +@halfcblasobjs = (cblas_shgemm); + @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qgemv,qger,qmax,qmin, @@ -3454,6 +3457,10 @@ use File::Spec; use File::Basename; my $dirname = File::Spec->catfile(dirname(dirname(File::Spec->rel2abs(__FILE__))), "lapack-netlib"); +if ($ARGV[12] == 1) { + @blasobjs = (@blasobjs, @halfblasobjs); + @cblasobjs = (@cblasobjs, @halfcblasobjs); +} if ($ARGV[8] == 1) { #ONLY_CBLAS=1 @underscore_objs = (@misc_underscore_objs); diff --git a/interface/Makefile b/interface/Makefile index 741f6bac0..44a9fdcf0 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,7 +46,9 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) SHBLAS3OBJS = shgemm.$(SUFFIX) +endif DBLAS1OBJS = \ daxpy.$(SUFFIX) dswap.$(SUFFIX) \ @@ -278,7 +280,9 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) +ifeq ($(BUILD_HALF),1) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +endif CDBLAS1OBJS = \ cblas_idamax.$(SUFFIX) cblas_idamin.$(SUFFIX) cblas_dasum.$(SUFFIX) cblas_daxpy.$(SUFFIX) \ @@ -1214,8 +1218,10 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) +endif sgemm.$(SUFFIX) sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) @@ -1778,8 +1784,10 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +ifeq ($(BUILD_HALF),1) cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) +endif cblas_dgemm.$(SUFFIX) cblas_dgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index 4113a1647..b114c6a33 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -137,7 +137,11 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type SINGLE DOUBLE HALF) string(SUBSTRING ${float_type} 0 1 float_char) if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (NOT ${BUILD_HALF}) + continue () + else () + set (float_char "SH") + endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) endforeach() diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index baf0c1c8a..da6c5fd57 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -59,7 +59,8 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -#ifndef SHGEMMKERNEL +ifeq ($(BUILD_HALF), 1) +ifndef SHGEMMKERNEL SHGEMM_BETA = ../generic/gemm_beta.c SHGEMMKERNEL = ../generic/gemmkernel_2x2.c SHGEMMINCOPY = ../generic/gemm_ncopy_2.c @@ -70,12 +71,13 @@ SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) -#endif +endif SHKERNELOBJS += \ shgemm_kernel$(TSUFFIX).$(SUFFIX) \ $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) +endif SKERNELOBJS += \ sgemm_kernel$(TSUFFIX).$(SUFFIX) \ @@ -110,7 +112,9 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += $(SHKERNELOBJS) +endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) QBLASOBJS += $(QKERNELOBJS) @@ -118,7 +122,10 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) +ifeq ($(BUILD_HALF),1) SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +endif + SBLASOBJS += \ sgemm_beta$(TSUFFIX).$(SUFFIX) \ strmm_kernel_LN$(TSUFFIX).$(SUFFIX) strmm_kernel_LT$(TSUFFIX).$(SUFFIX) \ @@ -408,11 +415,13 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) - +ifeq ($(BUILD_HALF), 1) SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +endif + SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMITCOPYOBJ_P = $(SGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMONCOPYOBJ_P = $(SGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -438,8 +447,10 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -459,10 +470,14 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) + $(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) + ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemmotcopy.s m4 shgemmotcopy.s > shgemmotcopy_nomacros.s @@ -487,6 +502,7 @@ else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif endif $(KDIR)$(SGEMMONCOPYOBJ) : $(KERNELDIR)/$(SGEMMONCOPY) @@ -646,6 +662,8 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif +ifeq ($(BUILD_HALF), 1) + $(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) ifeq ($(OS), AIX) $(CC) $(CFLAGS) -E -DHALF -UDOUBLE -UCOMPLEX $< -o shgemm_kernel$(TSUFFIX).s @@ -655,6 +673,7 @@ ifeq ($(OS), AIX) else $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif $(KDIR)dgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) $(DGEMMDEPEND) ifeq ($(OS), AIX) @@ -2272,8 +2291,10 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_HALF),1) $(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(CC) $(PFLAGS) -c -DDOUBLE -UCOMPLEX $< -o $@ @@ -2290,6 +2311,8 @@ $(KDIR)zgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(ZGEMM_BETA) $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ + +ifeq ($(BUILD_HALF), 1) $(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ @@ -2304,6 +2327,8 @@ $(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ endif +endif + $(SGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SGEMMONCOPY) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ @@ -2408,8 +2433,11 @@ endif endif + +ifeq ($(BUILD_HALF), 1) $(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index b7cf0f112..d3aa030c1 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,6 +53,7 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, +#ifdef BUILD_HALF 0, 0, 0, SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, #ifdef SHGEMM_DEFAULT_UNROLL_MN @@ -109,7 +110,7 @@ gotoblas_t TABLE_NAME = { #else NULL,NULL, #endif - +#endif 0, 0, 0, SGEMM_DEFAULT_UNROLL_M, SGEMM_DEFAULT_UNROLL_N, @@ -706,19 +707,25 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { +#if defined(BUILD_HALF) TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; +#if defined(BUILD_HALF) TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; @@ -782,20 +789,26 @@ static void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -805,20 +818,26 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef BUILD_HALF TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; +#ifdef BUILD_HALF TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; @@ -958,9 +977,11 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ +#ifdef BUILD_HALF TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;