diff --git a/CMakeLists.txt b/CMakeLists.txt index c324e2241..7bfd74cc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -87,13 +87,13 @@ if (NOT NO_LAPACK) list(APPEND SUBDIRS lapack) endif () -if (NOT DEFINED BUILD_HALF) - set (BUILD_HALF false) +if (NOT DEFINED BUILD_BFLOAT16) + set (BUILD_BFLOAT16 false) endif () # set which float types we want to build for if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) # if none are defined, build for all -# set(BUILD_HALF true) +# set(BUILD_BFLOAT16 true) set(BUILD_SINGLE true) set(BUILD_DOUBLE true) set(BUILD_COMPLEX true) @@ -125,9 +125,9 @@ if (BUILD_COMPLEX16) list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE endif () -if (BUILD_HALF) - message(STATUS "Building Half Precision") - list(APPEND FLOAT_TYPES "HALF") # defines nothing +if (BUILD_BFLOAT16) + message(STATUS "Building BFloat16 Precision") + list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing endif () if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") diff --git a/Makefile.rule b/Makefile.rule index 2c12177ee..afec6eab3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -275,7 +275,7 @@ COMMON_PROF = -pg # If you want to enable the experimental BFLOAT16 support -# BUILD_HALF = 1 +# BUILD_BFLOAT16 = 1 # # End of user configuration # diff --git a/Makefile.system b/Makefile.system index d7e71d00a..9d13e8d3a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -1209,8 +1209,8 @@ ifeq ($(USE_TLS), 1) CCOMMON_OPT += -DUSE_TLS endif -ifeq ($(BUILD_HALF), 1) -CCOMMON_OPT += -DBUILD_HALF +ifeq ($(BUILD_BFLOAT16), 1) +CCOMMON_OPT += -DBUILD_BFLOAT16 endif CCOMMON_OPT += -DVERSION=\"$(VERSION)\" @@ -1486,10 +1486,10 @@ export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE export NO_AVX512 -export BUILD_HALF +export BUILD_BFLOAT16 -export SHGEMM_UNROLL_M -export SHGEMM_UNROLL_N +export SBGEMM_UNROLL_M +export SBGEMM_UNROLL_N export SGEMM_UNROLL_M export SGEMM_UNROLL_N export DGEMM_UNROLL_M diff --git a/Makefile.tail b/Makefile.tail index 39902982b..92152ac54 100644 --- a/Makefile.tail +++ b/Makefile.tail @@ -1,4 +1,4 @@ -SHBLASOBJS_P = $(SHBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) +SBBLASOBJS_P = $(SBBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) SBLASOBJS_P = $(SBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) DBLASOBJS_P = $(DBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) @@ -10,8 +10,8 @@ COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) -BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) +BLASOBJS = $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +BLASOBJS_P = $(SBBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) ifdef EXPRECISION BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -23,7 +23,7 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) endif -$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX +$(SBBLASOBJS) $(SBBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX @@ -31,7 +31,7 @@ $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX -$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) +$(SBBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(DBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) diff --git a/benchmark/Makefile b/benchmark/Makefile index 2f70ceaf3..8aa6f89c3 100644 --- a/benchmark/Makefile +++ b/benchmark/Makefile @@ -49,10 +49,10 @@ else GOTO_LAPACK_TARGETS= endif -ifeq ($(BUILD_HALF),1) -GOTO_HALF_TARGETS=shgemm.goto +ifeq ($(BUILD_BFLOAT16),1) +GOTO_BFLOAT16_TARGETS=sbgemm.goto else -GOTO_HALF_TARGETS= +GOTO_BFLOAT16_TARGETS= endif ifeq ($(OSNAME), WINNT) @@ -97,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \ sgetri.goto dgetri.goto cgetri.goto zgetri.goto \ spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \ ssymm.goto dsymm.goto csymm.goto zsymm.goto \ - saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS) + saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_BFLOAT16_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -270,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \ samin.goto damin.goto camin.goto zamin.goto \ smin.goto dmin.goto \ saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \ - snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS) + snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_BFLOAT16_TARGETS) acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \ scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \ @@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) ##################################### Sgemm #################################################### -ifeq ($(BUILD_HALF),1) -shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME) $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm endif @@ -2927,9 +2927,9 @@ ccholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) : gemm.c - $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) : gemm.c + $(CC) $(CFLAGS) -c -DBFLOAT16 -UCOMPLEX -UDOUBLE -o $(@F) $^ endif sgemm.$(SUFFIX) : gemm.c diff --git a/benchmark/gemm.c b/benchmark/gemm.c index d2235330b..195d17d33 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -39,8 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifdef DOUBLE #define GEMM BLASFUNC(dgemm) -#elif defined(HALF) -#define GEMM BLASFUNC(shgemm) +#elif defined(BFLOAT16) +#define GEMM BLASFUNC(sbgemm) #else #define GEMM BLASFUNC(sgemm) #endif diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 4b505a102..87a3dcb2c 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -113,7 +113,7 @@ macro(SetDefaultL1) set(ZSUMKERNEL zsum.S) set(QSUMKERNEL sum.S) set(XSUMKERNEL zsum.S) -if (BUILD_HALF) +if (BUILD_BFLOAT16) set(SHAMINKERNEL ../arm/amin.c) set(SHAMAXKERNEL ../arm/amax.c) set(SHMAXKERNEL ../arm/max.c) @@ -181,7 +181,7 @@ macro(SetDefaultL2) set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) -if (BUILD_HALF) +if (BUILD_BFLOAT16) set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SHGERKERNEL ../generic/ger.c) @@ -193,18 +193,18 @@ macro(SetDefaultL3) set(DGEADD_KERNEL ../generic/geadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c) -if (BUILD_HALF) +if (BUILD_BFLOAT16) set(SHGEADD_KERNEL ../generic/geadd.c) - set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) - set(SHGEMM_BETA ../generic/gemm_beta.c) - set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) - set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) - set(SHGEMMINCOPYOBJ shgemm_incopy.o) - set(SHGEMMITCOPYOBJ shgemm_itcopy.o) - set(SHGEMMONCOPYOBJ shgemm_oncopy.o) - set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) + set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c) + set(SBGEMM_BETA ../generic/gemm_beta.c) + set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c) + set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c) + set(SBGEMMINCOPYOBJ sbgemm_incopy.o) + set(SBGEMMITCOPYOBJ sbgemm_itcopy.o) + set(SBGEMMONCOPYOBJ sbgemm_oncopy.o) + set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o) endif () endmacro () diff --git a/cmake/prebuild.cmake b/cmake/prebuild.cmake index 3b2a9d6a2..f40304c09 100644 --- a/cmake/prebuild.cmake +++ b/cmake/prebuild.cmake @@ -16,8 +16,8 @@ # HAVE_SSE2 # HAVE_SSE3 # MAKE -# SHGEMM_UNROLL_M -# SHGEMM_UNROLL_N +# SBGEMM_UNROLL_M +# SBGEMM_UNROLL_N # SGEMM_UNROLL_M # SGEMM_UNROLL_N # DGEMM_UNROLL_M @@ -471,8 +471,8 @@ endif () set(ZGEMM_UNROLL_N 2) set(SYMV_P 8) endif() - set(SHGEMM_UNROLL_M 8) - set(SHGEMM_UNROLL_N 4) + set(SBGEMM_UNROLL_M 8) + set(SBGEMM_UNROLL_N 4) # Or should this actually be NUM_CORES? if (${NUM_THREADS} GREATER 0) diff --git a/cmake/system.cmake b/cmake/system.cmake index d8dcc3cf3..06eb34c8c 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -548,8 +548,8 @@ endif () #export FUNCTION_PROFILE #export TARGET_CORE # -#export SHGEMM_UNROLL_M -#export SHGEMM_UNROLL_N +#export SBGEMM_UNROLL_M +#export SBGEMM_UNROLL_N #export SGEMM_UNROLL_M #export SGEMM_UNROLL_N #export DGEMM_UNROLL_M diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 1c21e776e..8f25c1b27 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in) if (complex_only) list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "DOUBLE") - list(REMOVE_ITEM float_list "HALF") + list(REMOVE_ITEM float_list "BFLOAT16") elseif (real_only) list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX") @@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in) if (NOT no_float_type) string(SUBSTRING ${float_type} 0 1 float_char) string(TOLOWER ${float_char} float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "sh") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "sb") endif () endif () @@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in) if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "DOUBLE") endif () - if (${float_type} STREQUAL "HALF") - list(APPEND obj_defines "HALF") + if (${float_type} STREQUAL "BFLOAT16") + list(APPEND obj_defines "BFLOAT16") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") list(APPEND obj_defines "COMPLEX") diff --git a/common.h b/common.h index d6637abe4..ef2a42985 100644 --- a/common.h +++ b/common.h @@ -257,9 +257,9 @@ typedef long BLASLONG; typedef unsigned long BLASULONG; #endif -#ifndef BFLOAT16 +#ifndef BUILD_IN_BFLOAT16 typedef unsigned short bfloat16; -#define HALFCONVERSION 1 +#define BFLOAT16CONVERSION 1 #endif #ifdef USE64BITINT @@ -302,7 +302,7 @@ typedef int blasint; #define SIZE 8 #define BASE_SHIFT 3 #define ZBASE_SHIFT 4 -#elif defined(HALF) +#elif defined(BFLOAT16) #define IFLOAT bfloat16 #define XFLOAT IFLOAT #define FLOAT float diff --git a/common_interface.h b/common_interface.h index 78f5be6b0..fddc06184 100644 --- a/common_interface.h +++ b/common_interface.h @@ -469,7 +469,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint /* Level 3 routines */ -void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, +void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *, bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); diff --git a/common_level3.h b/common_level3.h index 4e44a5e73..36b2f168c 100644 --- a/common_level3.h +++ b/common_level3.h @@ -55,7 +55,7 @@ extern void sgemm_kernel_direct(BLASLONG M, BLASLONG N, BLASLONG K, extern int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); -int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, +int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); @@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); #endif -int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); -int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); +int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); @@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); -int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); +int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); @@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); -int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); @@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); #endif -int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); -int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); +int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); diff --git a/common_macro.h b/common_macro.h index 8fe1f156f..02a7477b9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -39,7 +39,7 @@ #ifndef COMMON_MACRO #define COMMON_MACRO -#include "common_sh.h" +#include "common_sb.h" #include "common_s.h" #include "common_d.h" #include "common_q.h" @@ -644,7 +644,7 @@ #define GEADD_K DGEADD_K -#elif defined(HALF) +#elif defined(BFLOAT16) #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K @@ -676,32 +676,32 @@ #define NRM2_K SNRM2_K #define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_L SSYMV_THREAD_L -#define GEMM_BETA SHGEMM_BETA -#define GEMM_KERNEL_N SHGEMM_KERNEL -#define GEMM_KERNEL_L SHGEMM_KERNEL -#define GEMM_KERNEL_R SHGEMM_KERNEL -#define GEMM_KERNEL_B SHGEMM_KERNEL +#define GEMM_BETA SBGEMM_BETA +#define GEMM_KERNEL_N SBGEMM_KERNEL +#define GEMM_KERNEL_L SBGEMM_KERNEL +#define GEMM_KERNEL_R SBGEMM_KERNEL +#define GEMM_KERNEL_B SBGEMM_KERNEL -#define GEMM_NN SHGEMM_NN -#define GEMM_CN SHGEMM_TN -#define GEMM_TN SHGEMM_TN -#define GEMM_NC SHGEMM_NT -#define GEMM_NT SHGEMM_NT -#define GEMM_CC SHGEMM_TT -#define GEMM_CT SHGEMM_TT -#define GEMM_TC SHGEMM_TT -#define GEMM_TT SHGEMM_TT -#define GEMM_NR SHGEMM_NN -#define GEMM_TR SHGEMM_TN -#define GEMM_CR SHGEMM_TN -#define GEMM_RN SHGEMM_NN -#define GEMM_RT SHGEMM_NT -#define GEMM_RC SHGEMM_NT -#define GEMM_RR SHGEMM_NN -#define GEMM_ONCOPY SHGEMM_ONCOPY -#define GEMM_OTCOPY SHGEMM_OTCOPY -#define GEMM_INCOPY SHGEMM_INCOPY -#define GEMM_ITCOPY SHGEMM_ITCOPY +#define GEMM_NN SBGEMM_NN +#define GEMM_CN SBGEMM_TN +#define GEMM_TN SBGEMM_TN +#define GEMM_NC SBGEMM_NT +#define GEMM_NT SBGEMM_NT +#define GEMM_CC SBGEMM_TT +#define GEMM_CT SBGEMM_TT +#define GEMM_TC SBGEMM_TT +#define GEMM_TT SBGEMM_TT +#define GEMM_NR SBGEMM_NN +#define GEMM_TR SBGEMM_TN +#define GEMM_CR SBGEMM_TN +#define GEMM_RN SBGEMM_NN +#define GEMM_RT SBGEMM_NT +#define GEMM_RC SBGEMM_NT +#define GEMM_RR SBGEMM_NN +#define GEMM_ONCOPY SBGEMM_ONCOPY +#define GEMM_OTCOPY SBGEMM_OTCOPY +#define GEMM_INCOPY SBGEMM_INCOPY +#define GEMM_ITCOPY SBGEMM_ITCOPY #define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LL SSYMM_THREAD_LL #define SYMM_THREAD_RU SSYMM_THREAD_RU @@ -717,22 +717,22 @@ #define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RL SHEMM_THREAD_RL -#define GEMM_THREAD_NN SHGEMM_THREAD_NN -#define GEMM_THREAD_CN SHGEMM_THREAD_TN -#define GEMM_THREAD_TN SHGEMM_THREAD_TN -#define GEMM_THREAD_NC SHGEMM_THREAD_NT -#define GEMM_THREAD_NT SHGEMM_THREAD_NT -#define GEMM_THREAD_CC SHGEMM_THREAD_TT -#define GEMM_THREAD_CT SHGEMM_THREAD_TT -#define GEMM_THREAD_TC SHGEMM_THREAD_TT -#define GEMM_THREAD_TT SHGEMM_THREAD_TT -#define GEMM_THREAD_NR SHGEMM_THREAD_NN -#define GEMM_THREAD_TR SHGEMM_THREAD_TN -#define GEMM_THREAD_CR SHGEMM_THREAD_TN -#define GEMM_THREAD_RN SHGEMM_THREAD_NN -#define GEMM_THREAD_RT SHGEMM_THREAD_NT -#define GEMM_THREAD_RC SHGEMM_THREAD_NT -#define GEMM_THREAD_RR SHGEMM_THREAD_NN +#define GEMM_THREAD_NN SBGEMM_THREAD_NN +#define GEMM_THREAD_CN SBGEMM_THREAD_TN +#define GEMM_THREAD_TN SBGEMM_THREAD_TN +#define GEMM_THREAD_NC SBGEMM_THREAD_NT +#define GEMM_THREAD_NT SBGEMM_THREAD_NT +#define GEMM_THREAD_CC SBGEMM_THREAD_TT +#define GEMM_THREAD_CT SBGEMM_THREAD_TT +#define GEMM_THREAD_TC SBGEMM_THREAD_TT +#define GEMM_THREAD_TT SBGEMM_THREAD_TT +#define GEMM_THREAD_NR SBGEMM_THREAD_NN +#define GEMM_THREAD_TR SBGEMM_THREAD_TN +#define GEMM_THREAD_CR SBGEMM_THREAD_TN +#define GEMM_THREAD_RN SBGEMM_THREAD_NN +#define GEMM_THREAD_RT SBGEMM_THREAD_NT +#define GEMM_THREAD_RC SBGEMM_THREAD_NT +#define GEMM_THREAD_RR SBGEMM_THREAD_NN #ifdef UNIT @@ -2485,9 +2485,9 @@ #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_b; -extern BLASLONG shgemm_p; -extern BLASLONG shgemm_q; -extern BLASLONG shgemm_r; +extern BLASLONG sbgemm_p; +extern BLASLONG sbgemm_q; +extern BLASLONG sbgemm_r; extern BLASLONG sgemm_p; extern BLASLONG sgemm_q; extern BLASLONG sgemm_r; diff --git a/common_param.h b/common_param.h index c92609a76..7f7873a9d 100644 --- a/common_param.h +++ b/common_param.h @@ -47,9 +47,9 @@ typedef struct { int dtb_entries; int offsetA, offsetB, align; -#ifdef BUILD_HALF - int shgemm_p, shgemm_q, shgemm_r; - int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; +#ifdef BUILD_BFLOAT16 + int sbgemm_p, sbgemm_q, sbgemm_r; + int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; float (*shamax_k) (BLASLONG, float *, BLASLONG); float (*shamin_k) (BLASLONG, float *, BLASLONG); @@ -80,13 +80,13 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); - int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); + int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); + int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); - int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); - int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); + int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); @@ -1002,13 +1002,13 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 gotoblas -> exclusive_cache -#ifdef BUILD_HALF -#define SHGEMM_P gotoblas -> shgemm_p -#define SHGEMM_Q gotoblas -> shgemm_q -#define SHGEMM_R gotoblas -> shgemm_r -#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m -#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n -#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P gotoblas -> sbgemm_p +#define SBGEMM_Q gotoblas -> sbgemm_q +#define SBGEMM_R gotoblas -> sbgemm_r +#define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m +#define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n +#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn #endif #define SGEMM_P gotoblas -> sgemm_p @@ -1088,16 +1088,16 @@ extern gotoblas_t *gotoblas; #define HAVE_EX_L2 0 #endif -#ifdef BUILD_HALF -#define SHGEMM_P SHGEMM_DEFAULT_P -#define SHGEMM_Q SHGEMM_DEFAULT_Q -#define SHGEMM_R SHGEMM_DEFAULT_R -#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N -#ifdef SHGEMM_DEFAULT_UNROLL_MN -#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN +#ifdef BUILD_BFLOAT16 +#define SBGEMM_P SBGEMM_DEFAULT_P +#define SBGEMM_Q SBGEMM_DEFAULT_Q +#define SBGEMM_R SBGEMM_DEFAULT_R +#define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N +#ifdef SBGEMM_DEFAULT_UNROLL_MN +#define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN #else -#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) +#define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N)) #endif #endif @@ -1237,17 +1237,17 @@ extern gotoblas_t *gotoblas; #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N #elif defined(HALF) -#define GEMM_P SHGEMM_P -#define GEMM_Q SHGEMM_Q -#define GEMM_R SHGEMM_R -#define GEMM_UNROLL_M SHGEMM_UNROLL_M -#define GEMM_UNROLL_N SHGEMM_UNROLL_N -#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN -#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P -#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q -#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R -#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M -#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N +#define GEMM_P SBGEMM_P +#define GEMM_Q SBGEMM_Q +#define GEMM_R SBGEMM_R +#define GEMM_UNROLL_M SBGEMM_UNROLL_M +#define GEMM_UNROLL_N SBGEMM_UNROLL_N +#define GEMM_UNROLL_MN SBGEMM_UNROLL_MN +#define GEMM_DEFAULT_P SBGEMM_DEFAULT_P +#define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q +#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R +#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M +#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N #else #define GEMM_P SGEMM_P #define GEMM_Q SGEMM_Q @@ -1333,8 +1333,8 @@ extern gotoblas_t *gotoblas; #define GEMM_THREAD gemm_thread_n #endif -#ifndef SHGEMM_DEFAULT_R -#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) +#ifndef SBGEMM_DEFAULT_R +#define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #endif #ifndef SGEMM_DEFAULT_R diff --git a/common_sb.h b/common_sb.h new file mode 100644 index 000000000..fae8f5122 --- /dev/null +++ b/common_sb.h @@ -0,0 +1,65 @@ +#ifndef COMMON_SH_H +#define COMMON_SH_H + +#ifndef DYNAMIC_ARCH + +#define SBGEMM_ONCOPY sbgemm_oncopy +#define SBGEMM_OTCOPY sbgemm_otcopy + +#if SBGEMM_DEFAULT_UNROLL_M == sbgemm_DEFAULT_UNROLL_N +#define SBGEMM_INCOPY sbgemm_oncopy +#define SBGEMM_ITCOPY sbgemm_otcopy +#else +#define SBGEMM_INCOPY sbgemm_incopy +#define SBGEMM_ITCOPY sbgemm_itcopy +#endif +#define SBGEMM_BETA sbgemm_beta +#define SBGEMM_KERNEL sbgemm_kernel + +#else + +#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy +#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy +#define SBGEMM_INCOPY gotoblas -> sbgemm_incopy +#define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy +#define SBGEMM_BETA gotoblas -> sbgemm_beta +#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel + +#endif + +#define SBGEMM_NN sbgemm_nn +#define SBGEMM_CN sbgemm_tn +#define SBGEMM_TN sbgemm_tn +#define SBGEMM_NC sbgemm_nt +#define SBGEMM_NT sbgemm_nt +#define SBGEMM_CC sbgemm_tt +#define SBGEMM_CT sbgemm_tt +#define SBGEMM_TC sbgemm_tt +#define SBGEMM_TT sbgemm_tt +#define SBGEMM_NR sbgemm_nn +#define SBGEMM_TR sbgemm_tn +#define SBGEMM_CR sbgemm_tn +#define SBGEMM_RN sbgemm_nn +#define SBGEMM_RT sbgemm_nt +#define SBGEMM_RC sbgemm_nt +#define SBGEMM_RR sbgemm_nn + +#define SBGEMM_THREAD_NN sbgemm_thread_nn +#define SBGEMM_THREAD_CN sbgemm_thread_tn +#define SBGEMM_THREAD_TN sbgemm_thread_tn +#define SBGEMM_THREAD_NC sbgemm_thread_nt +#define SBGEMM_THREAD_NT sbgemm_thread_nt +#define SBGEMM_THREAD_CC sbgemm_thread_tt +#define SBGEMM_THREAD_CT sbgemm_thread_tt +#define SBGEMM_THREAD_TC sbgemm_thread_tt +#define SBGEMM_THREAD_TT sbgemm_thread_tt +#define SBGEMM_THREAD_NR sbgemm_thread_nn +#define SBGEMM_THREAD_TR sbgemm_thread_tn +#define SBGEMM_THREAD_CR sbgemm_thread_tn +#define SBGEMM_THREAD_RN sbgemm_thread_nn +#define SBGEMM_THREAD_RT sbgemm_thread_nt +#define SBGEMM_THREAD_RC sbgemm_thread_nt +#define SBGEMM_THREAD_RR sbgemm_thread_nn + +#endif + diff --git a/common_sh.h b/common_sh.h deleted file mode 100644 index 7a0045762..000000000 --- a/common_sh.h +++ /dev/null @@ -1,65 +0,0 @@ -#ifndef COMMON_SH_H -#define COMMON_SH_H - -#ifndef DYNAMIC_ARCH - -#define SHGEMM_ONCOPY shgemm_oncopy -#define SHGEMM_OTCOPY shgemm_otcopy - -#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N -#define SHGEMM_INCOPY shgemm_oncopy -#define SHGEMM_ITCOPY shgemm_otcopy -#else -#define SHGEMM_INCOPY shgemm_incopy -#define SHGEMM_ITCOPY shgemm_itcopy -#endif -#define SHGEMM_BETA shgemm_beta -#define SHGEMM_KERNEL shgemm_kernel - -#else - -#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy -#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy -#define SHGEMM_INCOPY gotoblas -> shgemm_incopy -#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy -#define SHGEMM_BETA gotoblas -> shgemm_beta -#define SHGEMM_KERNEL gotoblas -> shgemm_kernel - -#endif - -#define SHGEMM_NN shgemm_nn -#define SHGEMM_CN shgemm_tn -#define SHGEMM_TN shgemm_tn -#define SHGEMM_NC shgemm_nt -#define SHGEMM_NT shgemm_nt -#define SHGEMM_CC shgemm_tt -#define SHGEMM_CT shgemm_tt -#define SHGEMM_TC shgemm_tt -#define SHGEMM_TT shgemm_tt -#define SHGEMM_NR shgemm_nn -#define SHGEMM_TR shgemm_tn -#define SHGEMM_CR shgemm_tn -#define SHGEMM_RN shgemm_nn -#define SHGEMM_RT shgemm_nt -#define SHGEMM_RC shgemm_nt -#define SHGEMM_RR shgemm_nn - -#define SHGEMM_THREAD_NN shgemm_thread_nn -#define SHGEMM_THREAD_CN shgemm_thread_tn -#define SHGEMM_THREAD_TN shgemm_thread_tn -#define SHGEMM_THREAD_NC shgemm_thread_nt -#define SHGEMM_THREAD_NT shgemm_thread_nt -#define SHGEMM_THREAD_CC shgemm_thread_tt -#define SHGEMM_THREAD_CT shgemm_thread_tt -#define SHGEMM_THREAD_TC shgemm_thread_tt -#define SHGEMM_THREAD_TT shgemm_thread_tt -#define SHGEMM_THREAD_NR shgemm_thread_nn -#define SHGEMM_THREAD_TR shgemm_thread_tn -#define SHGEMM_THREAD_CR shgemm_thread_tn -#define SHGEMM_THREAD_RN shgemm_thread_nn -#define SHGEMM_THREAD_RT shgemm_thread_nt -#define SHGEMM_THREAD_RC shgemm_thread_nt -#define SHGEMM_THREAD_RR shgemm_thread_nn - -#endif - diff --git a/driver/level3/Makefile b/driver/level3/Makefile index 09a62d9bf..fe70d2dc2 100644 --- a/driver/level3/Makefile +++ b/driver/level3/Makefile @@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS) USE_GEMM3M = 1 endif -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX) endif SBLASOBJS += \ @@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$( COMMONOBJS += syrk_thread.$(SUFFIX) ifndef USE_SIMPLE_THREADED_LEVEL3 -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX) endif SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) @@ -289,17 +289,17 @@ endif all :: -shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) +sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) +sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) +sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) +sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -496,17 +496,17 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h $(CC) -c $(CFLAGS) $< -o $(@F) -shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) +sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) +sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) +sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) +sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2681,17 +2681,17 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) -shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) +sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) +sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) +sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) +sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) @@ -2889,17 +2889,17 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h $(CC) -c $(PFLAGS) $< -o $(@F) -shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) +sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) -shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) +sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) -shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) +sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) -shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h - $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) +sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h + $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DBFLOAT16 -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) diff --git a/driver/others/parameter.c b/driver/others/parameter.c index b1f3befae..801fe5fa5 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B; #endif -#if SHGEMM_P == shgemm_p -BLASLONG shgemm_p = DEFAULT_GEMM_P; +#if SBGEMM_P == sbgemm_p +BLASLONG sbgemm_p = DEFAULT_GEMM_P; #else -BLASLONG shgemm_p = SHGEMM_P; +BLASLONG sbgemm_p = SBGEMM_P; #endif #if SGEMM_P == sgemm_p BLASLONG sgemm_p = DEFAULT_GEMM_P; @@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P; BLASLONG zgemm_p = ZGEMM_P; #endif -#if SHGEMM_Q == shgemm_q -BLASLONG shgemm_q = DEFAULT_GEMM_Q; +#if SBGEMM_Q == sbgemm_q +BLASLONG sbgemm_q = DEFAULT_GEMM_Q; #else -BLASLONG shgemm_q = SHGEMM_Q; +BLASLONG sbgemm_q = SBGEMM_Q; #endif #if SGEMM_Q == sgemm_q BLASLONG sgemm_q = DEFAULT_GEMM_Q; @@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q; BLASLONG zgemm_q = ZGEMM_Q; #endif -#if SHGEMM_R == shgemm_r -BLASLONG shgemm_r = DEFAULT_GEMM_R; +#if SBGEMM_R == sbgemm_r +BLASLONG sbgemm_r = DEFAULT_GEMM_R; #else -BLASLONG shgemm_r = SHGEMM_R; +BLASLONG sbgemm_r = SBGEMM_R; #endif #if SGEMM_R == sgemm_r BLASLONG sgemm_r = DEFAULT_GEMM_R; @@ -612,7 +612,7 @@ void blas_set_parameter(void){ size = BITMASK(cpuid3, 16, 0xff); - shgemm_p = 192 * (size + 1); + sbgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1); dgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1); @@ -626,7 +626,7 @@ void blas_set_parameter(void){ xgemm_p = 16 * (size + 1); #endif - shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; + sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; diff --git a/exports/Makefile b/exports/Makefile index 75901586c..0618e90e4 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED BUILD_LAPACK_DEPRECATED = 0 endif -ifndef BUILD_HALF -BUILD_HALF = 0 +ifndef BUILD_BFLOAT16 +BUILD_BFLOAT16 = 0 endif ifeq ($(OSNAME), WINNT) @@ -246,23 +246,23 @@ static : ../$(LIBNAME) rm -f goto.$(SUFFIX) osx.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) > $(@F) aix.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) > $(@F) objcopy.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) > $(@F) objconv.def : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > $(@F) + perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) > $(@F) test : linktest.c $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. rm -f linktest linktest.c : gensymbol ../Makefile.system ../getarch.c - perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) > linktest.c + perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) > linktest.c clean :: @rm -f *.def *.dylib __.SYMDEF* *.renamed diff --git a/exports/gensymbol b/exports/gensymbol index 73b4be248..0de3f4b7d 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -46,7 +46,7 @@ ssum, dsum, scsum, dzsum ); -@halfblasobjs = (shgemm); +@halfblasobjs = (sbgemm); @cblasobjs = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -84,7 +84,7 @@ cblas_xerbla ); -@halfcblasobjs = (cblas_shgemm); +@halfcblasobjs = (cblas_sbgemm); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, diff --git a/getarch_2nd.c b/getarch_2nd.c index a1d0ccac8..c390ef52c 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -9,8 +9,8 @@ int main(int argc, char **argv) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { - printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); - printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); + printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M); + printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); diff --git a/interface/Makefile b/interface/Makefile index 2dbd60073..ba44d8889 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -46,8 +46,8 @@ SBLAS3OBJS = \ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -SHBLAS3OBJS = shgemm.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLAS3OBJS = sbgemm.$(SUFFIX) endif DBLAS1OBJS = \ @@ -280,8 +280,8 @@ CSBLAS3OBJS = \ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_sgeadd.$(SUFFIX) -ifeq ($(BUILD_HALF),1) -CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) endif CDBLAS1OBJS = \ @@ -374,7 +374,7 @@ override CFLAGS += -I. SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) -SHBLAS3OBJS += $(CSHBLAS3OBJS) +SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS3OBJS += $(CDBLAS3OBJS) @@ -388,7 +388,7 @@ ZBLAS3OBJS += $(CZBLAS3OBJS) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SHBLASOBJS = $(SHBLAS3OBJS) +SBBLASOBJS = $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -463,7 +463,7 @@ ZBLASOBJS += $(ZLAPACKOBJS) endif -FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) +FUNCOBJS = $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) @@ -497,10 +497,10 @@ level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $ level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level3 : $(SHBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) +level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -$(CSHBLASOBJS) $(CSHBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ +$(CSBBLASOBJS) $(CSBBLASOBJS_P) $(CSBLASOBJS) $(CSBLASOBJS_P) $(CDBLASOBJS) $(CDBLASOBJS_P) $(CQBLASOBJS) $(CQBLASOBJS_P) \ $(CCBLASOBJS) $(CCBLASOBJS_P) $(CZBLASOBJS) $(CZBLASOBJS_P) $(CXBLASOBJS) $(CXBLASOBJS_P) : override CFLAGS += -DCBLAS srot.$(SUFFIX) srot.$(PSUFFIX) : rot.c @@ -1218,8 +1218,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c $(CC) -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -c $(CFLAGS) $< -o $(@F) endif @@ -1784,8 +1784,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) -ifeq ($(BUILD_HALF),1) -cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) endif diff --git a/kernel/CMakeLists.txt b/kernel/CMakeLists.txt index d1349c5f8..a4a138d9b 100644 --- a/kernel/CMakeLists.txt +++ b/kernel/CMakeLists.txt @@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) @@ -96,8 +96,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) @@ -134,13 +134,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) set(USE_TRMM true) endif () - foreach (float_type SINGLE DOUBLE HALF) + foreach (float_type SINGLE DOUBLE BFLOAT16) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - if (NOT ${BUILD_HALF}) + if (${float_type} STREQUAL "BFLOAT16") + if (NOT ${BUILD_BFLOAT16}) continue () else () - set (float_char "SH") + set (float_char "SB") endif () endif () GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) @@ -148,8 +148,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (${float_char}GEMMINCOPY) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) @@ -490,8 +490,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) # Makefile.LA if(NOT NO_LAPACK) foreach (float_type ${FLOAT_TYPES}) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () if (NOT DEFINED ${float_char}NEG_TCOPY) if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") @@ -536,8 +536,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS) foreach (float_type ${FLOAT_TYPES}) # a bit of metaprogramming here to pull out the appropriate KERNEL var string(SUBSTRING ${float_type} 0 1 float_char) - if (${float_type} STREQUAL "HALF") - set (float_char "SH") + if (${float_type} STREQUAL "BFLOAT16") + set (float_char "SB") endif () GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 8df306d5f..dfd18d278 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -65,24 +65,24 @@ ifeq ($(CORE), Z14) USE_TRMM = 1 endif -ifeq ($(BUILD_HALF), 1) -ifndef SHGEMMKERNEL -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = ../generic/gemmkernel_2x2.c -SHGEMMINCOPY = ../generic/gemm_ncopy_2.c -SHGEMMITCOPY = ../generic/gemm_tcopy_2.c -SHGEMMONCOPY = ../generic/gemm_ncopy_2.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16), 1) +ifndef SBGEMMKERNEL +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = ../generic/gemmkernel_2x2.c +SBGEMMINCOPY = ../generic/gemm_ncopy_2.c +SBGEMMITCOPY = ../generic/gemm_tcopy_2.c +SBGEMMONCOPY = ../generic/gemm_ncopy_2.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) endif SHKERNELOBJS += \ - shgemm_kernel$(TSUFFIX).$(SUFFIX) \ - $(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ - $(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) + sbgemm_kernel$(TSUFFIX).$(SUFFIX) \ + $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \ + $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ) endif SKERNELOBJS += \ @@ -118,8 +118,8 @@ XKERNELOBJS += \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += $(SHKERNELOBJS) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += $(SHKERNELOBJS) endif SBLASOBJS += $(SKERNELOBJS) DBLASOBJS += $(DKERNELOBJS) @@ -128,8 +128,8 @@ CBLASOBJS += $(CKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS) XBLASOBJS += $(XKERNELOBJS) -ifeq ($(BUILD_HALF),1) -SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX) endif SBLASOBJS += \ @@ -421,11 +421,11 @@ ZBLASOBJS += \ zimatcopy_k_ctc$(TSUFFIX).$(SUFFIX) zimatcopy_k_rtc$(TSUFFIX).$(SUFFIX) \ zgeadd_k$(TSUFFIX).$(SUFFIX) -ifeq ($(BUILD_HALF), 1) -SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +ifeq ($(BUILD_BFLOAT16), 1) +SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) +SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) endif SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) @@ -453,9 +453,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) @@ -477,35 +477,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) +ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) +$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s - m4 shgemmotcopy.s > shgemmotcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ - rm shgemmotcopy.s shgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s + m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@ + rm sbgemmotcopy.s sbgemmotcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) -$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) +$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s - m4 shgemmitcopy.s > shgemmitcopy_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ - rm shgemmitcopy.s shgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s + m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@ + rm sbgemmitcopy.s sbgemmitcopy_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -668,16 +668,16 @@ else $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ endif -ifeq ($(BUILD_HALF), 1) +ifeq ($(BUILD_BFLOAT16), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) ifeq ($(OS), AIX) - $(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s - m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ - rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s + m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@ + rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s else - $(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ + $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2297,9 +2297,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_ $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF),1) -$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) @@ -2318,19 +2318,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA) $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ -ifeq ($(BUILD_HALF), 1) -$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) -$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N)) +$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ -$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif endif @@ -2440,9 +2440,9 @@ endif endif -ifeq ($(BUILD_HALF), 1) -$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) - $(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ +ifeq ($(BUILD_BFLOAT16), 1) +$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND) + $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@ endif $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) diff --git a/kernel/generic/gemmkernel_2x2.c b/kernel/generic/gemmkernel_2x2.c index cc7bb8e48..bf1c3ae38 100644 --- a/kernel/generic/gemmkernel_2x2.c +++ b/kernel/generic/gemmkernel_2x2.c @@ -1,5 +1,5 @@ #include "common.h" -#if defined(HALF) && defined(HALFCONVERSION) +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index f390fac61..371cef988 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -7,16 +7,16 @@ else #CGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c -SHGEMM_BETA = ../generic/gemm_beta.c -SHGEMMKERNEL = shgemm_kernel_power10.c -SHGEMMINCOPY = ../generic/gemm_ncopy_16.c -SHGEMMITCOPY = ../generic/gemm_tcopy_16.c -SHGEMMONCOPY = ../generic/gemm_ncopy_8.c -SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c -SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) -SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) -SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) -SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) +SBGEMM_BETA = ../generic/gemm_beta.c +SBGEMMKERNEL = sbgemm_kernel_power10.c +SBGEMMINCOPY = ../generic/gemm_ncopy_16.c +SBGEMMITCOPY = ../generic/gemm_tcopy_16.c +SBGEMMONCOPY = ../generic/gemm_ncopy_8.c +SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX) +SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX) +SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX) +SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX) STRMMKERNEL = sgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c diff --git a/kernel/power/shgemm_kernel_power10.c b/kernel/power/sbgemm_kernel_power10.c similarity index 99% rename from kernel/power/shgemm_kernel_power10.c rename to kernel/power/sbgemm_kernel_power10.c index 1ae9e04bf..46d82598a 100644 --- a/kernel/power/shgemm_kernel_power10.c +++ b/kernel/power/sbgemm_kernel_power10.c @@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. **********************************************************************************/ #include "common.h" #include -#if defined(HALF) && defined(HALFCONVERSION) +#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION) static float bfloat16tof32 (bfloat16 f16) { @@ -131,7 +131,7 @@ vector char mask = #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); /************************************************************************************* -* SHGEMM Kernel +* SBGEMM Kernel *************************************************************************************/ int CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d3aa030c1..c9ee7691f 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -53,13 +53,13 @@ gotoblas_t TABLE_NAME = { GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, -#ifdef BUILD_HALF +#ifdef BUILD_BFLOAT16 0, 0, 0, - SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, -#ifdef SHGEMM_DEFAULT_UNROLL_MN - SHGEMM_DEFAULT_UNROLL_MN, + SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N, +#ifdef SBGEMM_DEFAULT_UNROLL_MN + SBGEMM_DEFAULT_UNROLL_MN, #else - MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), + MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N), #endif samax_kTS, samin_kTS, smax_kTS, smin_kTS, @@ -70,13 +70,13 @@ gotoblas_t TABLE_NAME = { sgemv_nTS, sgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, - shgemm_kernelTS, shgemm_betaTS, -#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N - shgemm_incopyTS, shgemm_itcopyTS, + sbgemm_kernelTS, sbgemm_betaTS, +#if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N + sbgemm_incopyTS, sbgemm_itcopyTS, #else - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, #endif - shgemm_oncopyTS, shgemm_otcopyTS, + sbgemm_oncopyTS, sbgemm_otcopyTS, strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N @@ -707,24 +707,24 @@ gotoblas_t TABLE_NAME = { #if defined(ARCH_ARM64) static void init_parameter(void) { -#if defined(BUILD_HALF) - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#if defined(BUILD_BFLOAT16) + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#if defined(BUILD_HALF) - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#if defined(BUILD_BFLOAT16) + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; -#if defined(BUILD_HALF) - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#if defined(BUILD_BFLOAT16) + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -789,16 +789,16 @@ static void init_parameter(void) { #if defined(ARCH_POWER) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -806,8 +806,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -818,16 +818,16 @@ static void init_parameter(void) { #if defined(ARCH_ZARCH) static void init_parameter(void) { -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; #endif TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; #endif TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; @@ -835,8 +835,8 @@ static void init_parameter(void) { TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; -#ifdef BUILD_HALF - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; @@ -977,10 +977,10 @@ static void init_parameter(void) { (void) l2; /* dirty trick to suppress unused variable warning for targets */ /* where the GEMM unrolling parameters do not depend on l2 */ -#ifdef BUILD_HALF - TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; - TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; - TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; +#ifdef BUILD_BFLOAT16 + TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P; + TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R; + TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q; #endif TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; diff --git a/lapack/CMakeLists.txt b/lapack/CMakeLists.txt index 778e6f8fa..7acfe2d68 100644 --- a/lapack/CMakeLists.txt +++ b/lapack/CMakeLists.txt @@ -2,7 +2,7 @@ include_directories(${PROJECT_SOURCE_DIR}) include_directories(${PROJECT_BINARY_DIR}) -list (REMOVE_ITEM FLOAT_TYPES "HALF") +list (REMOVE_ITEM FLOAT_TYPES "BFLOAT16") set(LAPACK_SOURCES potrf/potrf_U_single.c @@ -46,7 +46,7 @@ GenerateNamedObjects("laswp/generic/laswp_k_4.c" "" "laswp_plus" false "" "" fa GenerateNamedObjects("laswp/generic/laswp_k_4.c" "MINUS" "laswp_minus" false "" "" false 3) foreach (float_type ${FLOAT_TYPES}) -if (${float_type} STREQUAL "HALF") +if (${float_type} STREQUAL "BFLOAT16") continue() endif() GenerateNamedObjects("getrf/getrf_single.c" "UNIT" "getrf_single" false "" "" false ${float_type}) diff --git a/lapack/potrf/potrf_parallel.c b/lapack/potrf/potrf_parallel.c index 008fcb8cc..b323f9b37 100644 --- a/lapack/potrf/potrf_parallel.c +++ b/lapack/potrf/potrf_parallel.c @@ -380,9 +380,9 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){ #elif defined(DOUBLE) mode = BLAS_DOUBLE | BLAS_REAL; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; -#elif defined(HALF) - mode = BLAS_HALF | BLAS_REAL; - mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; +#elif defined(BFLOAT16) + mode = BLAS_BFLOAT16 | BLAS_REAL; + mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1; #else mode = BLAS_SINGLE | BLAS_REAL; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; diff --git a/param.h b/param.h index 3e539a2b8..f549d802a 100644 --- a/param.h +++ b/param.h @@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #ifndef PARAM_H #define PARAM_H -#define SHGEMM_DEFAULT_UNROLL_N 4 -#define SHGEMM_DEFAULT_UNROLL_M 8 -#define SHGEMM_DEFAULT_UNROLL_MN 32 -#define SHGEMM_DEFAULT_P 256 -#define SHGEMM_DEFAULT_R 256 -#define SHGEMM_DEFAULT_Q 256 +#define SBGEMM_DEFAULT_UNROLL_N 4 +#define SBGEMM_DEFAULT_UNROLL_M 8 +#define SBGEMM_DEFAULT_UNROLL_MN 32 +#define SBGEMM_DEFAULT_P 256 +#define SBGEMM_DEFAULT_R 256 +#define SBGEMM_DEFAULT_Q 256 #ifdef OPTERON #define SNUMOPT 4 @@ -2308,16 +2308,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #if defined(POWER10) -#undef SHGEMM_DEFAULT_UNROLL_N -#undef SHGEMM_DEFAULT_UNROLL_M -#undef SHGEMM_DEFAULT_P -#undef SHGEMM_DEFAULT_R -#undef SHGEMM_DEFAULT_Q -#define SHGEMM_DEFAULT_UNROLL_M 16 -#define SHGEMM_DEFAULT_UNROLL_N 8 -#define SHGEMM_DEFAULT_P 832 -#define SHGEMM_DEFAULT_Q 1026 -#define SHGEMM_DEFAULT_R 4096 +#undef SBGEMM_DEFAULT_UNROLL_N +#undef SBGEMM_DEFAULT_UNROLL_M +#undef SBGEMM_DEFAULT_P +#undef SBGEMM_DEFAULT_R +#undef SBGEMM_DEFAULT_Q +#define SBGEMM_DEFAULT_UNROLL_M 16 +#define SBGEMM_DEFAULT_UNROLL_N 8 +#define SBGEMM_DEFAULT_P 832 +#define SBGEMM_DEFAULT_Q 1026 +#define SBGEMM_DEFAULT_R 4096 #endif #if defined(SPARC) && defined(V7) diff --git a/test/Makefile b/test/Makefile index 45f9821ec..38f9f432c 100644 --- a/test/Makefile +++ b/test/Makefile @@ -64,15 +64,15 @@ endif endif endif -ifeq ($(BUILD_HALF),1) -level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 +ifeq ($(BUILD_BFLOAT16),1) +level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3 else level3 : sblat3 dblat3 cblat3 zblat3 endif ifndef CROSS rm -f ?BLAT3.SUMM -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./sblat3 < ./sblat3.dat @@ -86,8 +86,8 @@ endif ifdef SMP rm -f ?BLAT3.SUMM ifeq ($(USE_OPENMP), 1) -ifeq ($(BUILD_HALF),1) - OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif OMP_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -99,8 +99,8 @@ endif OMP_NUM_THREADS=2 ./zblat3 < ./zblat3.dat @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 else -ifeq ($(BUILD_HALF),1) - OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM +ifeq ($(BUILD_BFLOAT16),1) + OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 endif OPENBLAS_NUM_THREADS=2 ./sblat3 < ./sblat3.dat @@ -181,9 +181,9 @@ zblat2 : zblat2.$(SUFFIX) ../$(LIBNAME) sblat3 : sblat3.$(SUFFIX) ../$(LIBNAME) $(FC) $(FLDFLAGS) -o sblat3 sblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) -ifeq ($(BUILD_HALF),1) -test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) - $(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) +ifeq ($(BUILD_BFLOAT16),1) +test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME) + $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) endif dblat3 : dblat3.$(SUFFIX) ../$(LIBNAME) @@ -208,7 +208,7 @@ clean: @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ sblat1 dblat1 cblat1 zblat1 \ sblat2 dblat2 cblat2 zblat2 \ - test_shgemm sblat3 dblat3 cblat3 zblat3 \ + test_sbgemm sblat3 dblat3 cblat3 zblat3 \ sblat1p dblat1p cblat1p zblat1p \ sblat2p dblat2p cblat2p zblat2p \ sblat3p dblat3p cblat3p zblat3p \ diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_sbgemm.c similarity index 96% rename from test/compare_sgemm_shgemm.c rename to test/compare_sgemm_sbgemm.c index 57aee7b8f..3d4eb2515 100644 --- a/test/compare_sgemm_shgemm.c +++ b/test/compare_sgemm_sbgemm.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "../common.h" #define SGEMM BLASFUNC(sgemm) -#define SHGEMM BLASFUNC(shgemm) +#define SBGEMM BLASFUNC(sbgemm) typedef union { unsigned short v; @@ -102,7 +102,7 @@ main (int argc, char *argv[]) } SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, &m, B, &k, &beta, C, &m); - SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, + SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, &m, BB, &k, &beta, CC, &m); for (i = 0; i < n; i++) for (j = 0; j < m; j++) @@ -126,6 +126,6 @@ main (int argc, char *argv[]) } } if (ret != 0) - fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); + fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret); return ret; }