Merge pull request #2886 from martin-frbg/issue_2767

Rename "HALF" precision functions (sh prefix) to "BFLOAT16" with "sb" prefix
This commit is contained in:
Martin Kroeker 2020-10-13 00:04:35 +02:00 committed by GitHub
commit 08929430cd
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
42 changed files with 648 additions and 678 deletions

View File

@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc
else() else()
set(NO_AFFINITY 1) set(NO_AFFINITY 1)
endif() endif()
option(BUILD_SINGLE "Single precision" OFF) option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
option(BUILD_DOUBLE "Double precision" OFF) option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
option(BUILD_COMPLEX "Single precision" OFF)
option(BUILD_COMPLEX16 "Single precision" OFF)
# Add a prefix or suffix to all exported symbol names in the shared library. # Add a prefix or suffix to all exported symbol names in the shared library.
# Avoids conflicts with other BLAS libraries, especially when using # Avoids conflicts with other BLAS libraries, especially when using
@ -91,13 +89,13 @@ if (NOT NO_LAPACK)
list(APPEND SUBDIRS lapack) list(APPEND SUBDIRS lapack)
endif () endif ()
if (NOT DEFINED BUILD_HALF) if (NOT DEFINED BUILD_BFLOAT16)
set (BUILD_HALF false) set (BUILD_BFLOAT16 false)
endif () endif ()
# set which float types we want to build for # set which float types we want to build for
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16) if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
# if none are defined, build for all # if none are defined, build for all
# set(BUILD_HALF true) # set(BUILD_BFLOAT16 true)
set(BUILD_SINGLE true) set(BUILD_SINGLE true)
set(BUILD_DOUBLE true) set(BUILD_DOUBLE true)
set(BUILD_COMPLEX true) set(BUILD_COMPLEX true)
@ -110,33 +108,28 @@ endif()
set(FLOAT_TYPES "") set(FLOAT_TYPES "")
if (BUILD_SINGLE) if (BUILD_SINGLE)
message(STATUS "Building Songle Precision") message(STATUS "Building Single Precision")
list(APPEND FLOAT_TYPES "SINGLE") list(APPEND FLOAT_TYPES "SINGLE") # defines nothing
# set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
endif () endif ()
if (BUILD_DOUBLE) if (BUILD_DOUBLE)
message(STATUS "Building Double Precision") message(STATUS "Building Double Precision")
list(APPEND FLOAT_TYPES "DOUBLE") list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1")
endif () endif ()
if (BUILD_COMPLEX) if (BUILD_COMPLEX)
message(STATUS "Building Complex Precision") message(STATUS "Building Complex Precision")
list(APPEND FLOAT_TYPES "COMPLEX") list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1")
endif () endif ()
if (BUILD_COMPLEX16) if (BUILD_COMPLEX16)
message(STATUS "Building Double Complex Precision") message(STATUS "Building Double Complex Precision")
list(APPEND FLOAT_TYPES "ZCOMPLEX") list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1")
endif () endif ()
if (BUILD_HALF) if (BUILD_BFLOAT16)
message(STATUS "Building Half Precision") message(STATUS "Building Half Precision")
list(APPEND FLOAT_TYPES "HALF") list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF")
endif () endif ()
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN") if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN)
add_subdirectory(ctest) add_subdirectory(ctest)
endif() endif()
add_subdirectory(lapack-netlib/TESTING) add_subdirectory(lapack-netlib/TESTING)
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
add_subdirectory(cpp_thread_test)
endif()
endif() endif()
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES

View File

@ -272,17 +272,33 @@ COMMON_PROF = -pg
# work at all. # work at all.
# #
# CPP_THREAD_SAFETY_TEST = 1 # CPP_THREAD_SAFETY_TEST = 1
#
# use this to run only the less memory-hungry GEMV test
# CPP_THREAD_SAFETY_GEMV = 1
# If you want to enable the experimental BFLOAT16 support # If you want to enable the experimental BFLOAT16 support
# BUILD_HALF = 1 # BUILD_BFLOAT16 = 1
#
# Select if you need to build only select types
# BUILD_SINGLE = 1 # Set the thread number threshold beyond which the job array for the threaded level3 BLAS
# BUILD_DOUBLE = 1 # will be allocated on the heap rather than the stack. (This array alone requires
# BUILD_COMPLEX = 1 # NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
# BUILD_COMPLEX16 = 1 # counts, but obviously it is not the only item that ends up on the stack.)
# # The default value of 32 ensures that the overall requirement is compatible
# # with the default 1MB stacksize imposed by having the Java VM loaded without use
# of its -Xss parameter.
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java
# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code
# BLAS3_MEM_ALLOC_THRESHOLD = 160
# the below is not yet configurable, use cmake if you need to build only select types
BUILD_SINGLE = 1
BUILD_DOUBLE = 1
BUILD_COMPLEX = 1
BUILD_COMPLEX16 = 1
# End of user configuration # End of user configuration
# #

View File

@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1)
CCOMMON_OPT += -DUSE_TLS CCOMMON_OPT += -DUSE_TLS
endif endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
CCOMMON_OPT += -DBUILD_HALF CCOMMON_OPT += -DBUILD_BFLOAT16
endif endif
ifeq ($(BUILD_SINGLE), 1) ifeq ($(BUILD_SINGLE), 1)
CCOMMON_OPT += -DBUILD_SINGLE=1 CCOMMON_OPT += -DBUILD_SINGLE=1
@ -1521,10 +1521,10 @@ export KERNELDIR
export FUNCTION_PROFILE export FUNCTION_PROFILE
export TARGET_CORE export TARGET_CORE
export NO_AVX512 export NO_AVX512
export BUILD_HALF export BUILD_BFLOAT16
export SHGEMM_UNROLL_M export SBGEMM_UNROLL_M
export SHGEMM_UNROLL_N export SBGEMM_UNROLL_N
export SGEMM_UNROLL_M export SGEMM_UNROLL_M
export SGEMM_UNROLL_N export SGEMM_UNROLL_N
export DGEMM_UNROLL_M export DGEMM_UNROLL_M

View File

@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P) BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
endif endif
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX $(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX $(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX $(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)

View File

@ -49,8 +49,8 @@ else
GOTO_LAPACK_TARGETS= GOTO_LAPACK_TARGETS=
endif endif
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
GOTO_HALF_TARGETS=shgemm.goto GOTO_HALF_TARGETS=sbgemm.goto
else else
GOTO_HALF_TARGETS= GOTO_HALF_TARGETS=
endif endif
@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX)
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
##################################### Sgemm #################################################### ##################################### Sgemm ####################################################
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME) sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME)
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm $(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
endif endif
@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c
zcholesky.$(SUFFIX) : cholesky.c zcholesky.$(SUFFIX) : cholesky.c
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^ $(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
shgemm.$(SUFFIX) : gemm.c sbgemm.$(SUFFIX) : gemm.c
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^ $(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
endif endif

View File

@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifdef DOUBLE #ifdef DOUBLE
#define GEMM BLASFUNC(dgemm) #define GEMM BLASFUNC(dgemm)
#elif defined(HALF) #elif defined(HALF)
#define GEMM BLASFUNC(shgemm) #define GEMM BLASFUNC(sbgemm)
#else #else
#define GEMM BLASFUNC(sgemm) #define GEMM BLASFUNC(sgemm)
#endif #endif

View File

@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
/* convert BFLOAT16 array to double array */ /* convert BFLOAT16 array to double array */
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot product of BFLOAT16 input arrays, and output as float */ /* dot product of BFLOAT16 input arrays, and output as float */
float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -113,7 +113,7 @@ macro(SetDefaultL1)
set(ZSUMKERNEL zsum.S) set(ZSUMKERNEL zsum.S)
set(QSUMKERNEL sum.S) set(QSUMKERNEL sum.S)
set(XSUMKERNEL zsum.S) set(XSUMKERNEL zsum.S)
if (BUILD_HALF) if (BUILD_BFLOAT16)
set(SHAMINKERNEL ../arm/amin.c) set(SHAMINKERNEL ../arm/amin.c)
set(SHAMAXKERNEL ../arm/amax.c) set(SHAMAXKERNEL ../arm/amax.c)
set(SHMAXKERNEL ../arm/max.c) set(SHMAXKERNEL ../arm/max.c)
@ -126,7 +126,7 @@ if (BUILD_HALF)
set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPYKERNEL ../arm/axpy.c)
set(SHAXPBYKERNEL ../arm/axpby.c) set(SHAXPBYKERNEL ../arm/axpby.c)
set(SHCOPYKERNEL ../arm/copy.c) set(SHCOPYKERNEL ../arm/copy.c)
set(SHDOTKERNEL ../x86_64/shdot.c) set(SBDOTKERNEL ../x86_64/sbdot.c)
set(SHROTKERNEL ../arm/rot.c) set(SHROTKERNEL ../arm/rot.c)
set(SHSCALKERNEL ../arm/scal.c) set(SHSCALKERNEL ../arm/scal.c)
set(SHNRM2KERNEL ../arm/nrm2.c) set(SHNRM2KERNEL ../arm/nrm2.c)
@ -183,9 +183,9 @@ macro(SetDefaultL2)
set(XHEMV_L_KERNEL ../generic/zhemv_k.c) set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_HALF) if (BUILD_BFLOAT16)
set(SHGEMVNKERNEL ../arm/gemv_n.c) set(SBGEMVNKERNEL ../arm/gemv_n.c)
set(SHGEMVTKERNEL ../arm/gemv_t.c) set(SBGEMVTKERNEL ../arm/gemv_t.c)
set(SHGERKERNEL ../generic/ger.c) set(SHGERKERNEL ../generic/ger.c)
endif () endif ()
endmacro () endmacro ()
@ -195,18 +195,18 @@ macro(SetDefaultL3)
set(DGEADD_KERNEL ../generic/geadd.c) set(DGEADD_KERNEL ../generic/geadd.c)
set(CGEADD_KERNEL ../generic/zgeadd.c) set(CGEADD_KERNEL ../generic/zgeadd.c)
set(ZGEADD_KERNEL ../generic/zgeadd.c) set(ZGEADD_KERNEL ../generic/zgeadd.c)
if (BUILD_HALF) if (BUILD_BFLOAT16)
set(SHGEADD_KERNEL ../generic/geadd.c) set(SHGEADD_KERNEL ../generic/geadd.c)
set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c) set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
set(SHGEMM_BETA ../generic/gemm_beta.c) set(SBGEMM_BETA ../generic/gemm_beta.c)
set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c) set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c) set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c) set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c) set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
set(SHGEMMINCOPYOBJ shgemm_incopy.o) set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
set(SHGEMMITCOPYOBJ shgemm_itcopy.o) set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
set(SHGEMMONCOPYOBJ shgemm_oncopy.o) set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
set(SHGEMMOTCOPYOBJ shgemm_otcopy.o) set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
endif () endif ()
endmacro () endmacro ()

View File

@ -16,8 +16,8 @@
# HAVE_SSE2 # HAVE_SSE2
# HAVE_SSE3 # HAVE_SSE3
# MAKE # MAKE
# SHGEMM_UNROLL_M # SBGEMM_UNROLL_M
# SHGEMM_UNROLL_N # SBGEMM_UNROLL_N
# SGEMM_UNROLL_M # SGEMM_UNROLL_M
# SGEMM_UNROLL_N # SGEMM_UNROLL_N
# DGEMM_UNROLL_M # DGEMM_UNROLL_M
@ -471,8 +471,8 @@ endif ()
set(ZGEMM_UNROLL_N 2) set(ZGEMM_UNROLL_N 2)
set(SYMV_P 8) set(SYMV_P 8)
endif() endif()
set(SHGEMM_UNROLL_M 8) set(SBGEMM_UNROLL_M 8)
set(SHGEMM_UNROLL_N 4) set(SBGEMM_UNROLL_N 4)
# Or should this actually be NUM_CORES? # Or should this actually be NUM_CORES?
if (${NUM_THREADS} GREATER 0) if (${NUM_THREADS} GREATER 0)

View File

@ -326,7 +326,13 @@ else ()
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048") set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
endif () endif ()
endif () endif ()
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
endif()
endif()
endif()
if (DEFINED LIBNAMESUFFIX) if (DEFINED LIBNAMESUFFIX)
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}") set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
else () else ()
@ -404,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO
set (BUILD_COMPLEX16 ON) set (BUILD_COMPLEX16 ON)
endif() endif()
if (BUILD_SINGLE) if (BUILD_SINGLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE")
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
endif() endif()
if (BUILD_DOUBLE) if (BUILD_DOUBLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE")
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
endif() endif()
if (BUILD_COMPLEX) if (BUILD_COMPLEX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX")
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1")
endif() endif()
if (BUILD_COMPLEX16) if (BUILD_COMPLEX16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1")
endif() endif()
if(NOT MSVC) if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
@ -591,8 +593,8 @@ endif ()
#export FUNCTION_PROFILE #export FUNCTION_PROFILE
#export TARGET_CORE #export TARGET_CORE
# #
#export SHGEMM_UNROLL_M #export SBGEMM_UNROLL_M
#export SHGEMM_UNROLL_N #export SBGEMM_UNROLL_N
#export SGEMM_UNROLL_M #export SGEMM_UNROLL_M
#export SGEMM_UNROLL_N #export SGEMM_UNROLL_N
#export DGEMM_UNROLL_M #export DGEMM_UNROLL_M

View File

@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in)
if (complex_only) if (complex_only)
list(REMOVE_ITEM float_list "SINGLE") list(REMOVE_ITEM float_list "SINGLE")
list(REMOVE_ITEM float_list "DOUBLE") list(REMOVE_ITEM float_list "DOUBLE")
list(REMOVE_ITEM float_list "HALF") list(REMOVE_ITEM float_list "BFLOAT16")
elseif (real_only) elseif (real_only)
list(REMOVE_ITEM float_list "COMPLEX") list(REMOVE_ITEM float_list "COMPLEX")
list(REMOVE_ITEM float_list "ZCOMPLEX") list(REMOVE_ITEM float_list "ZCOMPLEX")
@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in)
if (NOT no_float_type) if (NOT no_float_type)
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
string(TOLOWER ${float_char} float_char) string(TOLOWER ${float_char} float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "sh") set (float_char "sb")
endif () endif ()
endif () endif ()
@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in)
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX") if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "DOUBLE") list(APPEND obj_defines "DOUBLE")
endif () endif ()
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
list(APPEND obj_defines "HALF") list(APPEND obj_defines "BFLOAT16")
endif () endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
list(APPEND obj_defines "COMPLEX") list(APPEND obj_defines "COMPLEX")

View File

@ -260,7 +260,7 @@ typedef unsigned long BLASULONG;
#ifndef BFLOAT16 #ifndef BFLOAT16
#include <stdint.h> #include <stdint.h>
typedef uint16_t bfloat16; typedef uint16_t bfloat16;
#define HALFCONVERSION 1 #define BFLOAT16CONVERSION 1
#endif #endif
#ifdef USE64BITINT #ifdef USE64BITINT
@ -303,7 +303,7 @@ typedef int blasint;
#define SIZE 8 #define SIZE 8
#define BASE_SHIFT 3 #define BASE_SHIFT 3
#define ZBASE_SHIFT 4 #define ZBASE_SHIFT 4
#elif defined(HALF) #elif defined(BFLOAT16)
#define IFLOAT bfloat16 #define IFLOAT bfloat16
#define XFLOAT IFLOAT #define XFLOAT IFLOAT
#define FLOAT float #define FLOAT float

View File

@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *);
double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *);
xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *); float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *); void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *); void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *);
@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
/* Level 3 routines */ /* Level 3 routines */
void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *, void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *); bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *, void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
float *, blasint *, float *, blasint *, float *, float *, blasint *); float *, blasint *, float *, blasint *, float *, float *, blasint *);

View File

@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG);
xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);

View File

@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K,
int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K); int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float, int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
#endif #endif
int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b); int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b); int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag); int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG); int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG); int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG); int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG); int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
#endif #endif
int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG); int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG); int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);

View File

@ -39,7 +39,7 @@
#ifndef COMMON_MACRO #ifndef COMMON_MACRO
#define COMMON_MACRO #define COMMON_MACRO
#include "common_sh.h" #include "common_sb.h"
#include "common_s.h" #include "common_s.h"
#include "common_d.h" #include "common_d.h"
#include "common_q.h" #include "common_q.h"
@ -644,7 +644,7 @@
#define GEADD_K DGEADD_K #define GEADD_K DGEADD_K
#elif defined(HALF) #elif defined(BFLOAT16)
#define D_TO_BF16_K SHDTOBF16_K #define D_TO_BF16_K SHDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K #define D_BF16_TO_K DBF16TOD_K
@ -662,7 +662,7 @@
#define ASUM_K SASUM_K #define ASUM_K SASUM_K
#define DOTU_K SDOTU_K #define DOTU_K SDOTU_K
#define DOTC_K SDOTC_K #define DOTC_K SDOTC_K
#define BF16_DOT_K SHDOT_K #define BF16_DOT_K SBDOT_K
#define AXPYU_K SAXPYU_K #define AXPYU_K SAXPYU_K
#define AXPYC_K SAXPYC_K #define AXPYC_K SAXPYC_K
#define AXPBY_K SAXPBY_K #define AXPBY_K SAXPBY_K
@ -682,32 +682,32 @@
#define NRM2_K SNRM2_K #define NRM2_K SNRM2_K
#define SYMV_THREAD_U SSYMV_THREAD_U #define SYMV_THREAD_U SSYMV_THREAD_U
#define SYMV_THREAD_L SSYMV_THREAD_L #define SYMV_THREAD_L SSYMV_THREAD_L
#define GEMM_BETA SHGEMM_BETA #define GEMM_BETA SBGEMM_BETA
#define GEMM_KERNEL_N SHGEMM_KERNEL #define GEMM_KERNEL_N SBGEMM_KERNEL
#define GEMM_KERNEL_L SHGEMM_KERNEL #define GEMM_KERNEL_L SBGEMM_KERNEL
#define GEMM_KERNEL_R SHGEMM_KERNEL #define GEMM_KERNEL_R SBGEMM_KERNEL
#define GEMM_KERNEL_B SHGEMM_KERNEL #define GEMM_KERNEL_B SBGEMM_KERNEL
#define GEMM_NN SHGEMM_NN #define GEMM_NN SBGEMM_NN
#define GEMM_CN SHGEMM_TN #define GEMM_CN SBGEMM_TN
#define GEMM_TN SHGEMM_TN #define GEMM_TN SBGEMM_TN
#define GEMM_NC SHGEMM_NT #define GEMM_NC SBGEMM_NT
#define GEMM_NT SHGEMM_NT #define GEMM_NT SBGEMM_NT
#define GEMM_CC SHGEMM_TT #define GEMM_CC SBGEMM_TT
#define GEMM_CT SHGEMM_TT #define GEMM_CT SBGEMM_TT
#define GEMM_TC SHGEMM_TT #define GEMM_TC SBGEMM_TT
#define GEMM_TT SHGEMM_TT #define GEMM_TT SBGEMM_TT
#define GEMM_NR SHGEMM_NN #define GEMM_NR SBGEMM_NN
#define GEMM_TR SHGEMM_TN #define GEMM_TR SBGEMM_TN
#define GEMM_CR SHGEMM_TN #define GEMM_CR SBGEMM_TN
#define GEMM_RN SHGEMM_NN #define GEMM_RN SBGEMM_NN
#define GEMM_RT SHGEMM_NT #define GEMM_RT SBGEMM_NT
#define GEMM_RC SHGEMM_NT #define GEMM_RC SBGEMM_NT
#define GEMM_RR SHGEMM_NN #define GEMM_RR SBGEMM_NN
#define GEMM_ONCOPY SHGEMM_ONCOPY #define GEMM_ONCOPY SBGEMM_ONCOPY
#define GEMM_OTCOPY SHGEMM_OTCOPY #define GEMM_OTCOPY SBGEMM_OTCOPY
#define GEMM_INCOPY SHGEMM_INCOPY #define GEMM_INCOPY SBGEMM_INCOPY
#define GEMM_ITCOPY SHGEMM_ITCOPY #define GEMM_ITCOPY SBGEMM_ITCOPY
#define SYMM_THREAD_LU SSYMM_THREAD_LU #define SYMM_THREAD_LU SSYMM_THREAD_LU
#define SYMM_THREAD_LL SSYMM_THREAD_LL #define SYMM_THREAD_LL SSYMM_THREAD_LL
#define SYMM_THREAD_RU SSYMM_THREAD_RU #define SYMM_THREAD_RU SSYMM_THREAD_RU
@ -723,22 +723,22 @@
#define HEMM_THREAD_RU SHEMM_THREAD_RU #define HEMM_THREAD_RU SHEMM_THREAD_RU
#define HEMM_THREAD_RL SHEMM_THREAD_RL #define HEMM_THREAD_RL SHEMM_THREAD_RL
#define GEMM_THREAD_NN SHGEMM_THREAD_NN #define GEMM_THREAD_NN SBGEMM_THREAD_NN
#define GEMM_THREAD_CN SHGEMM_THREAD_TN #define GEMM_THREAD_CN SBGEMM_THREAD_TN
#define GEMM_THREAD_TN SHGEMM_THREAD_TN #define GEMM_THREAD_TN SBGEMM_THREAD_TN
#define GEMM_THREAD_NC SHGEMM_THREAD_NT #define GEMM_THREAD_NC SBGEMM_THREAD_NT
#define GEMM_THREAD_NT SHGEMM_THREAD_NT #define GEMM_THREAD_NT SBGEMM_THREAD_NT
#define GEMM_THREAD_CC SHGEMM_THREAD_TT #define GEMM_THREAD_CC SBGEMM_THREAD_TT
#define GEMM_THREAD_CT SHGEMM_THREAD_TT #define GEMM_THREAD_CT SBGEMM_THREAD_TT
#define GEMM_THREAD_TC SHGEMM_THREAD_TT #define GEMM_THREAD_TC SBGEMM_THREAD_TT
#define GEMM_THREAD_TT SHGEMM_THREAD_TT #define GEMM_THREAD_TT SBGEMM_THREAD_TT
#define GEMM_THREAD_NR SHGEMM_THREAD_NN #define GEMM_THREAD_NR SBGEMM_THREAD_NN
#define GEMM_THREAD_TR SHGEMM_THREAD_TN #define GEMM_THREAD_TR SBGEMM_THREAD_TN
#define GEMM_THREAD_CR SHGEMM_THREAD_TN #define GEMM_THREAD_CR SBGEMM_THREAD_TN
#define GEMM_THREAD_RN SHGEMM_THREAD_NN #define GEMM_THREAD_RN SBGEMM_THREAD_NN
#define GEMM_THREAD_RT SHGEMM_THREAD_NT #define GEMM_THREAD_RT SBGEMM_THREAD_NT
#define GEMM_THREAD_RC SHGEMM_THREAD_NT #define GEMM_THREAD_RC SBGEMM_THREAD_NT
#define GEMM_THREAD_RR SHGEMM_THREAD_NN #define GEMM_THREAD_RR SBGEMM_THREAD_NN
#ifdef UNIT #ifdef UNIT
@ -2491,9 +2491,9 @@
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64) #if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
extern BLASLONG gemm_offset_a; extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b; extern BLASLONG gemm_offset_b;
extern BLASLONG shgemm_p; extern BLASLONG sbgemm_p;
extern BLASLONG shgemm_q; extern BLASLONG sbgemm_q;
extern BLASLONG shgemm_r; extern BLASLONG sbgemm_r;
extern BLASLONG sgemm_p; extern BLASLONG sgemm_p;
extern BLASLONG sgemm_q; extern BLASLONG sgemm_q;
extern BLASLONG sgemm_r; extern BLASLONG sgemm_r;

View File

@ -47,114 +47,114 @@ typedef struct {
int dtb_entries; int dtb_entries;
int offsetA, offsetB, align; int offsetA, offsetB, align;
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
int shgemm_p, shgemm_q, shgemm_r; int sbgemm_p, sbgemm_q, sbgemm_r;
int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG); void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG);
float (*shamax_k) (BLASLONG, float *, BLASLONG); float (*sbamax_k) (BLASLONG, float *, BLASLONG);
float (*shamin_k) (BLASLONG, float *, BLASLONG); float (*sbamin_k) (BLASLONG, float *, BLASLONG);
float (*shmax_k) (BLASLONG, float *, BLASLONG); float (*sbmax_k) (BLASLONG, float *, BLASLONG);
float (*shmin_k) (BLASLONG, float *, BLASLONG); float (*sbmin_k) (BLASLONG, float *, BLASLONG);
BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG);
BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG);
BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG); BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG);
BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG); BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
float (*shnrm2_k) (BLASLONG, float *, BLASLONG); float (*sbnrm2_k) (BLASLONG, float *, BLASLONG);
float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*sbasum_k) (BLASLONG, float *, BLASLONG);
float (*shsum_k) (BLASLONG, float *, BLASLONG); float (*sbsum_k) (BLASLONG, float *, BLASLONG);
int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG); float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG); int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG); int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *); int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*sbtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*sbtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*sbsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#endif #endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16) #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
int sgemm_p, sgemm_q, sgemm_r; int sgemm_p, sgemm_q, sgemm_r;
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn; int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
#endif #endif
int exclusive_cache; int exclusive_cache;
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
float (*samax_k) (BLASLONG, float *, BLASLONG); float (*samax_k) (BLASLONG, float *, BLASLONG);
float (*samin_k) (BLASLONG, float *, BLASLONG); float (*samin_k) (BLASLONG, float *, BLASLONG);
float (*smax_k) (BLASLONG, float *, BLASLONG); float (*smax_k) (BLASLONG, float *, BLASLONG);
@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
float (*snrm2_k) (BLASLONG, float *, BLASLONG); float (*snrm2_k) (BLASLONG, float *, BLASLONG);
float (*sasum_k) (BLASLONG, float *, BLASLONG); float (*sasum_k) (BLASLONG, float *, BLASLONG);
#endif #endif
#ifdef BUILD_SINGLE
#if BUILD_SINGLE
float (*ssum_k) (BLASLONG, float *, BLASLONG); float (*ssum_k) (BLASLONG, float *, BLASLONG);
#endif #endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) #if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); //double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16)
int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif #endif
#ifdef BUILD_SINGLE
#if BUILD_SINGLE
int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif #endif
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
#ifdef ARCH_X86_64 #ifdef ARCH_X86_64
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG); void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K); int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
#endif #endif
#ifdef BUILD_SINGLE
#if (BUILD_SINGLE) || (BUILD_DOUBLE)
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
#endif
#if BUILD_SINGLE
int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG); int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *); int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
#endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE)
int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#endif #endif
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
int dgemm_p, dgemm_q, dgemm_r; int dgemm_p, dgemm_q, dgemm_r;
int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn; int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn;
#endif #endif
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
double (*damax_k) (BLASLONG, double *, BLASLONG); double (*damax_k) (BLASLONG, double *, BLASLONG);
double (*damin_k) (BLASLONG, double *, BLASLONG); double (*damin_k) (BLASLONG, double *, BLASLONG);
double (*dmax_k) (BLASLONG, double *, BLASLONG); double (*dmax_k) (BLASLONG, double *, BLASLONG);
@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG);
BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG); BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
#endif #endif
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
double (*dnrm2_k) (BLASLONG, double *, BLASLONG); double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
double (*dasum_k) (BLASLONG, double *, BLASLONG); double (*dasum_k) (BLASLONG, double *, BLASLONG);
#endif #endif
#if BUILD_DOUBLE #ifdef BUILD_DOUBLE
double (*dsum_k) (BLASLONG, double *, BLASLONG); double (*dsum_k) (BLASLONG, double *, BLASLONG);
#endif #endif
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG); double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
#endif #endif
#if (BUILD_SINGLE) || (BUILD_DOUBLE) #if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE)
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif #endif
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16) #if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double); int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
#endif #endif
#ifdef BUILD_DOUBLE
#if BUILD_DOUBLE
int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
#endif #endif
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG); int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG); int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *); int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
#endif #endif
#ifdef BUILD_DOUBLE
#if BUILD_DOUBLE
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG); int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
#endif #endif
#ifdef BUILD_COMPLEX
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
int cgemm_p, cgemm_q, cgemm_r; int cgemm_p, cgemm_q, cgemm_r;
int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn; int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn;
float (*camax_k) (BLASLONG, float *, BLASLONG); float (*camax_k) (BLASLONG, float *, BLASLONG);
float (*camin_k) (BLASLONG, float *, BLASLONG); float (*camin_k) (BLASLONG, float *, BLASLONG);
BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG);
BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG); BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
#endif
#if BUILD_COMPLEX
float (*cnrm2_k) (BLASLONG, float *, BLASLONG); float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
float (*casum_k) (BLASLONG, float *, BLASLONG); float (*casum_k) (BLASLONG, float *, BLASLONG);
float (*csum_k) (BLASLONG, float *, BLASLONG); float (*csum_k) (BLASLONG, float *, BLASLONG);
#endif
#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16)
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
#endif
#if BUILD_COMPLEX
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
#endif
#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16)
int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#if (BUILD_COMPLEX)
int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
#endif
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG); int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *); int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
#endif
#if (BUILD_COMPLEX)
int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG); int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *); int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
#endif
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *); int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *); int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
#endif #endif
#ifdef BUILD_COMPLEX16
#if BUILD_COMPLEX16
int zgemm_p, zgemm_q, zgemm_r; int zgemm_p, zgemm_q, zgemm_r;
int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn; int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn;
@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
void (*init)(void); void (*init)(void);
int snum_opt, dnum_opt, qnum_opt; int snum_opt, dnum_opt, qnum_opt;
#ifdef BUILD_SINGLE
#if BUILD_SINGLE
int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG); int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
#endif #endif
#if BUILD_DOUBLE #ifdef BUILD_DOUBLE
int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG); int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX #ifdef BUILD_COMPLEX
int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG); int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX16 #ifdef BUILD_COMPLEX16
int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG); int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
#endif #endif
#if BUILD_SINGLE #ifdef BUILD_SINGLE
int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG); int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
#endif #endif
#if BUILD_DOUBLE #ifdef BUILD_DOUBLE
int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG); int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX #ifdef BUILD_COMPLEX
int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG); int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX16 #ifdef BUILD_COMPLEX16
int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG); int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
#endif #endif
#if BUILD_SINGLE #ifdef BUILD_SINGLE
int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG); int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG);
#endif #endif
#if BUILD_DOUBLE #ifdef BUILD_DOUBLE
int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG); int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX #ifdef BUILD_COMPLEX
int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG); int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
#endif #endif
#if BUILD_COMPLEX16 #ifdef BUILD_COMPLEX16
int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG); int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
#endif #endif
#if BUILD_SINGLE #ifdef BUILD_SINGLE
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG); int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
#endif #endif
#if BUILD_DOUBLE #ifdef BUILD_DOUBLE
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG); int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
#endif #endif
#if BUILD_COMPLEX #ifdef BUILD_COMPLEX
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG); int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
#endif #endif
#if BUILD_COMPLEX16 #ifdef BUILD_COMPLEX16
int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG); int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
#endif #endif
} gotoblas_t; } gotoblas_t;
@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas;
#define HAVE_EX_L2 gotoblas -> exclusive_cache #define HAVE_EX_L2 gotoblas -> exclusive_cache
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
#define SHGEMM_P gotoblas -> shgemm_p #define SBGEMM_P gotoblas -> sbgemm_p
#define SHGEMM_Q gotoblas -> shgemm_q #define SBGEMM_Q gotoblas -> sbgemm_q
#define SHGEMM_R gotoblas -> shgemm_r #define SBGEMM_R gotoblas -> sbgemm_r
#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m #define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m
#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n #define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n
#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn #define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn
#endif #endif
#if (BUILD_SINGLE) #if defined (BUILD_SINGLE)
#define SGEMM_P gotoblas -> sgemm_p #define SGEMM_P gotoblas -> sgemm_p
#define SGEMM_Q gotoblas -> sgemm_q #define SGEMM_Q gotoblas -> sgemm_q
#define SGEMM_R gotoblas -> sgemm_r #define SGEMM_R gotoblas -> sgemm_r
@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas;
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn #define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
#endif #endif
#if (BUILD_DOUBLE) #if defined (BUILD_DOUBLE)
#define DGEMM_P gotoblas -> dgemm_p #define DGEMM_P gotoblas -> dgemm_p
#define DGEMM_Q gotoblas -> dgemm_q #define DGEMM_Q gotoblas -> dgemm_q
#define DGEMM_R gotoblas -> dgemm_r #define DGEMM_R gotoblas -> dgemm_r
#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m #define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
#if ! (BUILD_SINGLE)
#define SGEMM_P gotoblas -> sgemm_p
#define SGEMM_Q gotoblas -> sgemm_q
#define SGEMM_R gotoblas -> sgemm_r
#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m
#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
#endif
#endif #endif
#define QGEMM_P gotoblas -> qgemm_p #define QGEMM_P gotoblas -> qgemm_p
@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas;
#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n #define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n
#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn #define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn
#if BUILD_COMPLEX #ifdef BUILD_COMPLEX
#define CGEMM_P gotoblas -> cgemm_p #define CGEMM_P gotoblas -> cgemm_p
#define CGEMM_Q gotoblas -> cgemm_q #define CGEMM_Q gotoblas -> cgemm_q
#define CGEMM_R gotoblas -> cgemm_r #define CGEMM_R gotoblas -> cgemm_r
@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas;
#endif #endif
#endif #endif
#if BUILD_COMPLEX16 #ifdef BUILD_COMPLEX16
#define ZGEMM_P gotoblas -> zgemm_p #define ZGEMM_P gotoblas -> zgemm_p
#define ZGEMM_Q gotoblas -> zgemm_q #define ZGEMM_Q gotoblas -> zgemm_q
#define ZGEMM_R gotoblas -> zgemm_r #define ZGEMM_R gotoblas -> zgemm_r
@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas;
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n #define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn #define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
#endif #endif
#ifndef BUILD_COMPLEX
#define CGEMM_P gotoblas -> cgemm_p
#define CGEMM_Q gotoblas -> cgemm_q
#define CGEMM_R gotoblas -> cgemm_r
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
#endif
#endif #endif
#define XGEMM_P gotoblas -> xgemm_p #define XGEMM_P gotoblas -> xgemm_p
@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas;
#define HAVE_EX_L2 0 #define HAVE_EX_L2 0
#endif #endif
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
#define SHGEMM_P SHGEMM_DEFAULT_P #define SBGEMM_P SBGEMM_DEFAULT_P
#define SHGEMM_Q SHGEMM_DEFAULT_Q #define SBGEMM_Q SBGEMM_DEFAULT_Q
#define SHGEMM_R SHGEMM_DEFAULT_R #define SBGEMM_R SBGEMM_DEFAULT_R
#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M #define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M
#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N #define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N
#ifdef SHGEMM_DEFAULT_UNROLL_MN #ifdef SBGEMM_DEFAULT_UNROLL_MN
#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN #define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN
#else #else
#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N)) #define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N))
#endif #endif
#endif #endif
@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas;
#endif #endif
#ifndef COMPLEX #ifndef COMPLEX
#if (XDOUBLE) #if defined(XDOUBLE)
#define GEMM_P QGEMM_P #define GEMM_P QGEMM_P
#define GEMM_Q QGEMM_Q #define GEMM_Q QGEMM_Q
#define GEMM_R QGEMM_R #define GEMM_R QGEMM_R
@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas;
#define GEMM_DEFAULT_R DGEMM_DEFAULT_R #define GEMM_DEFAULT_R DGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N #define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N
#elif (HALF) #elif defined(BFLOAT16)
#define GEMM_P SHGEMM_P #define GEMM_P SBGEMM_P
#define GEMM_Q SHGEMM_Q #define GEMM_Q SBGEMM_Q
#define GEMM_R SHGEMM_R #define GEMM_R SBGEMM_R
#define GEMM_UNROLL_M SHGEMM_UNROLL_M #define GEMM_UNROLL_M SBGEMM_UNROLL_M
#define GEMM_UNROLL_N SHGEMM_UNROLL_N #define GEMM_UNROLL_N SBGEMM_UNROLL_N
#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN #define GEMM_UNROLL_MN SBGEMM_UNROLL_MN
#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P #define GEMM_DEFAULT_P SBGEMM_DEFAULT_P
#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q #define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q
#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R #define GEMM_DEFAULT_R SBGEMM_DEFAULT_R
#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M #define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M
#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N #define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N
#else #else
#define GEMM_P SGEMM_P #define GEMM_P SGEMM_P
#define GEMM_Q SGEMM_Q #define GEMM_Q SGEMM_Q
@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas;
#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N #define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N
#endif #endif
#else #else
#if (XDOUBLE) #if defined(XDOUBLE)
#define GEMM_P XGEMM_P #define GEMM_P XGEMM_P
#define GEMM_Q XGEMM_Q #define GEMM_Q XGEMM_Q
#define GEMM_R XGEMM_R #define GEMM_R XGEMM_R
@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas;
#define GEMM_THREAD gemm_thread_n #define GEMM_THREAD gemm_thread_n
#endif #endif
#ifndef SHGEMM_DEFAULT_R #ifndef SBGEMM_DEFAULT_R
#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL) #define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
#endif #endif
#ifndef SGEMM_DEFAULT_R #ifndef SGEMM_DEFAULT_R

77
common_sb.h Normal file
View File

@ -0,0 +1,77 @@
/*
 * Name-mapping macros for the "sb"-prefixed (BFLOAT16) routines.
 *
 * Every macro here is a pure alias: it expands either to a plain function
 * symbol or, when DYNAMIC_ARCH is defined, to a member reached through the
 * `gotoblas` pointer. No code or data is defined in this header.
 */
#ifndef COMMON_SB_H
#define COMMON_SB_H
/* Without DYNAMIC_ARCH: each kernel macro expands to the bare symbol name. */
#ifndef DYNAMIC_ARCH
#define SBDOT_K sbdot_k
#define SBSTOBF16_K sbstobf16_k
#define SBDTOBF16_K sbdtobf16_k
#define SBF16TOS_K sbf16tos_k
#define DBF16TOD_K dbf16tod_k
#define SBGEMM_ONCOPY sbgemm_oncopy
#define SBGEMM_OTCOPY sbgemm_otcopy
/*
 * When the default M and N unroll factors compare equal, the "I" copy macros
 * are aliased to the "O" copy symbols; otherwise they name separate
 * sbgemm_incopy / sbgemm_itcopy symbols.
 * NOTE(review): if neither SBGEMM_DEFAULT_UNROLL_* macro is defined at this
 * point, the preprocessor evaluates both as 0 and takes the aliasing branch.
 */
#if SBGEMM_DEFAULT_UNROLL_M == SBGEMM_DEFAULT_UNROLL_N
#define SBGEMM_INCOPY sbgemm_oncopy
#define SBGEMM_ITCOPY sbgemm_otcopy
#else
#define SBGEMM_INCOPY sbgemm_incopy
#define SBGEMM_ITCOPY sbgemm_itcopy
#endif
#define SBGEMM_BETA sbgemm_beta
#define SBGEMM_KERNEL sbgemm_kernel
#else
/*
 * With DYNAMIC_ARCH: the same macro names expand to members accessed through
 * the `gotoblas` pointer. INCOPY/ITCOPY always name their own members here;
 * there is no unroll-based aliasing in this branch.
 */
#define SBDOT_K gotoblas -> sbdot_k
#define SBSTOBF16_K gotoblas -> sbstobf16_k
#define SBDTOBF16_K gotoblas -> sbdtobf16_k
#define SBF16TOS_K gotoblas -> sbf16tos_k
#define DBF16TOD_K gotoblas -> dbf16tod_k
#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy
#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy
#define SBGEMM_INCOPY gotoblas -> sbgemm_incopy
#define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy
#define SBGEMM_BETA gotoblas -> sbgemm_beta
#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel
#endif
/*
 * GEMM driver names. All sixteen two-letter suffixes collapse onto four
 * symbols (nn, nt, tn, tt): a C in the macro name resolves to the same symbol
 * as T, and an R resolves to the same symbol as N.
 * NOTE(review): C/R presumably denote the conjugated variants, which would be
 * no-ops for a real type -- confirm against the complex headers.
 */
#define SBGEMM_NN sbgemm_nn
#define SBGEMM_CN sbgemm_tn
#define SBGEMM_TN sbgemm_tn
#define SBGEMM_NC sbgemm_nt
#define SBGEMM_NT sbgemm_nt
#define SBGEMM_CC sbgemm_tt
#define SBGEMM_CT sbgemm_tt
#define SBGEMM_TC sbgemm_tt
#define SBGEMM_TT sbgemm_tt
#define SBGEMM_NR sbgemm_nn
#define SBGEMM_TR sbgemm_tn
#define SBGEMM_CR sbgemm_tn
#define SBGEMM_RN sbgemm_nn
#define SBGEMM_RT sbgemm_nt
#define SBGEMM_RC sbgemm_nt
#define SBGEMM_RR sbgemm_nn
/* The *_THREAD_* names follow the same sixteen-to-four suffix mapping. */
#define SBGEMM_THREAD_NN sbgemm_thread_nn
#define SBGEMM_THREAD_CN sbgemm_thread_tn
#define SBGEMM_THREAD_TN sbgemm_thread_tn
#define SBGEMM_THREAD_NC sbgemm_thread_nt
#define SBGEMM_THREAD_NT sbgemm_thread_nt
#define SBGEMM_THREAD_CC sbgemm_thread_tt
#define SBGEMM_THREAD_CT sbgemm_thread_tt
#define SBGEMM_THREAD_TC sbgemm_thread_tt
#define SBGEMM_THREAD_TT sbgemm_thread_tt
#define SBGEMM_THREAD_NR sbgemm_thread_nn
#define SBGEMM_THREAD_TR sbgemm_thread_tn
#define SBGEMM_THREAD_CR sbgemm_thread_tn
#define SBGEMM_THREAD_RN sbgemm_thread_nn
#define SBGEMM_THREAD_RT sbgemm_thread_nt
#define SBGEMM_THREAD_RC sbgemm_thread_nt
#define SBGEMM_THREAD_RR sbgemm_thread_nn
#endif

View File

@ -1,77 +0,0 @@
/*
 * Name-mapping macros for the "sh"-prefixed routines.
 *
 * Every macro here is a pure alias: it expands either to a plain function
 * symbol or, when DYNAMIC_ARCH is defined, to a member reached through the
 * `gotoblas` pointer. No code or data is defined in this header.
 */
#ifndef COMMON_SH_H
#define COMMON_SH_H
/* Without DYNAMIC_ARCH: each kernel macro expands to the bare symbol name. */
#ifndef DYNAMIC_ARCH
#define SHDOT_K shdot_k
#define SHSTOBF16_K shstobf16_k
#define SHDTOBF16_K shdtobf16_k
/* These two conversion macros are not SH-prefixed, unlike the rest. */
#define SBF16TOS_K sbf16tos_k
#define DBF16TOD_K dbf16tod_k
#define SHGEMM_ONCOPY shgemm_oncopy
#define SHGEMM_OTCOPY shgemm_otcopy
/*
 * When the default M and N unroll factors compare equal, the "I" copy macros
 * are aliased to the "O" copy symbols; otherwise they name separate
 * shgemm_incopy / shgemm_itcopy symbols.
 */
#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N
#define SHGEMM_INCOPY shgemm_oncopy
#define SHGEMM_ITCOPY shgemm_otcopy
#else
#define SHGEMM_INCOPY shgemm_incopy
#define SHGEMM_ITCOPY shgemm_itcopy
#endif
#define SHGEMM_BETA shgemm_beta
#define SHGEMM_KERNEL shgemm_kernel
#else
/*
 * With DYNAMIC_ARCH: the same macro names expand to members accessed through
 * the `gotoblas` pointer. INCOPY/ITCOPY always name their own members here.
 */
#define SHDOT_K gotoblas -> shdot_k
#define SHSTOBF16_K gotoblas -> shstobf16_k
#define SHDTOBF16_K gotoblas -> shdtobf16_k
#define SBF16TOS_K gotoblas -> sbf16tos_k
#define DBF16TOD_K gotoblas -> dbf16tod_k
#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy
#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy
#define SHGEMM_INCOPY gotoblas -> shgemm_incopy
#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy
#define SHGEMM_BETA gotoblas -> shgemm_beta
#define SHGEMM_KERNEL gotoblas -> shgemm_kernel
#endif
/*
 * GEMM driver names. All sixteen two-letter suffixes collapse onto four
 * symbols (nn, nt, tn, tt): a C in the macro name resolves to the same symbol
 * as T, and an R resolves to the same symbol as N.
 */
#define SHGEMM_NN shgemm_nn
#define SHGEMM_CN shgemm_tn
#define SHGEMM_TN shgemm_tn
#define SHGEMM_NC shgemm_nt
#define SHGEMM_NT shgemm_nt
#define SHGEMM_CC shgemm_tt
#define SHGEMM_CT shgemm_tt
#define SHGEMM_TC shgemm_tt
#define SHGEMM_TT shgemm_tt
#define SHGEMM_NR shgemm_nn
#define SHGEMM_TR shgemm_tn
#define SHGEMM_CR shgemm_tn
#define SHGEMM_RN shgemm_nn
#define SHGEMM_RT shgemm_nt
#define SHGEMM_RC shgemm_nt
#define SHGEMM_RR shgemm_nn
/* The *_THREAD_* names follow the same sixteen-to-four suffix mapping. */
#define SHGEMM_THREAD_NN shgemm_thread_nn
#define SHGEMM_THREAD_CN shgemm_thread_tn
#define SHGEMM_THREAD_TN shgemm_thread_tn
#define SHGEMM_THREAD_NC shgemm_thread_nt
#define SHGEMM_THREAD_NT shgemm_thread_nt
#define SHGEMM_THREAD_CC shgemm_thread_tt
#define SHGEMM_THREAD_CT shgemm_thread_tt
#define SHGEMM_THREAD_TC shgemm_thread_tt
#define SHGEMM_THREAD_TT shgemm_thread_tt
#define SHGEMM_THREAD_NR shgemm_thread_nn
#define SHGEMM_THREAD_TR shgemm_thread_tn
#define SHGEMM_THREAD_CR shgemm_thread_tn
#define SHGEMM_THREAD_RN shgemm_thread_nn
#define SHGEMM_THREAD_RT shgemm_thread_nt
#define SHGEMM_THREAD_RC shgemm_thread_nt
#define SHGEMM_THREAD_RR shgemm_thread_nn
#endif

View File

@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS)
USE_GEMM3M = 1 USE_GEMM3M = 1
endif endif
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX) SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX)
endif endif
SBLASOBJS += \ SBLASOBJS += \
@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(
COMMONOBJS += syrk_thread.$(SUFFIX) COMMONOBJS += syrk_thread.$(SUFFIX)
ifndef USE_SIMPLE_THREADED_LEVEL3 ifndef USE_SIMPLE_THREADED_LEVEL3
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX) SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX)
endif endif
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX) SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX) DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
@ -343,16 +343,16 @@ endif
all :: all ::
shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
@ -550,16 +550,16 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h
beta_thread.$(SUFFIX) : beta_thread.c ../../common.h beta_thread.$(SUFFIX) : beta_thread.c ../../common.h
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
@ -2735,16 +2735,16 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c
xtrsm_RCLN.$(SUFFIX) : trsm_R.c xtrsm_RCLN.$(SUFFIX) : trsm_R.c
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F) $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F)
shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
@ -2943,16 +2943,16 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h
$(CC) -c $(PFLAGS) $< -o $(@F) $(CC) -c $(PFLAGS) $< -o $(@F)
shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F) $(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h

View File

@ -227,7 +227,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */ /* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,

View File

@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */ /* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,

View File

@ -112,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){ } else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */ /* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,

View File

@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
BLASLONG gemm_offset_b = GEMM_OFFSET_B; BLASLONG gemm_offset_b = GEMM_OFFSET_B;
#endif #endif
#if SHGEMM_P == shgemm_p #if SBGEMM_P == sbgemm_p
BLASLONG shgemm_p = DEFAULT_GEMM_P; BLASLONG sbgemm_p = DEFAULT_GEMM_P;
#else #else
BLASLONG shgemm_p = SHGEMM_P; BLASLONG sbgemm_p = SBGEMM_P;
#endif #endif
#if SGEMM_P == sgemm_p #if SGEMM_P == sgemm_p
BLASLONG sgemm_p = DEFAULT_GEMM_P; BLASLONG sgemm_p = DEFAULT_GEMM_P;
@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P;
BLASLONG zgemm_p = ZGEMM_P; BLASLONG zgemm_p = ZGEMM_P;
#endif #endif
#if SHGEMM_Q == shgemm_q #if SBGEMM_Q == sbgemm_q
BLASLONG shgemm_q = DEFAULT_GEMM_Q; BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
#else #else
BLASLONG shgemm_q = SHGEMM_Q; BLASLONG sbgemm_q = SBGEMM_Q;
#endif #endif
#if SGEMM_Q == sgemm_q #if SGEMM_Q == sgemm_q
BLASLONG sgemm_q = DEFAULT_GEMM_Q; BLASLONG sgemm_q = DEFAULT_GEMM_Q;
@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q;
BLASLONG zgemm_q = ZGEMM_Q; BLASLONG zgemm_q = ZGEMM_Q;
#endif #endif
#if SHGEMM_R == shgemm_r #if SBGEMM_R == sbgemm_r
BLASLONG shgemm_r = DEFAULT_GEMM_R; BLASLONG sbgemm_r = DEFAULT_GEMM_R;
#else #else
BLASLONG shgemm_r = SHGEMM_R; BLASLONG sbgemm_r = SBGEMM_R;
#endif #endif
#if SGEMM_R == sgemm_r #if SGEMM_R == sgemm_r
BLASLONG sgemm_r = DEFAULT_GEMM_R; BLASLONG sgemm_r = DEFAULT_GEMM_R;
@ -615,7 +615,7 @@ void blas_set_parameter(void){
size = BITMASK(cpuid3, 16, 0xff); size = BITMASK(cpuid3, 16, 0xff);
shgemm_p = 192 * (size + 1); sbgemm_p = 192 * (size + 1);
sgemm_p = 192 * (size + 1); sgemm_p = 192 * (size + 1);
dgemm_p = 96 * (size + 1); dgemm_p = 96 * (size + 1);
cgemm_p = 96 * (size + 1); cgemm_p = 96 * (size + 1);
@ -629,7 +629,7 @@ void blas_set_parameter(void){
xgemm_p = 16 * (size + 1); xgemm_p = 16 * (size + 1);
#endif #endif
shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15; sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15; sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15; dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15; cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;

View File

@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED
BUILD_LAPACK_DEPRECATED = 0 BUILD_LAPACK_DEPRECATED = 0
endif endif
ifndef BUILD_HALF ifndef BUILD_BFLOAT16
BUILD_HALF = 0 BUILD_BFLOAT16 = 0
endif endif
ifndef BUILD_SINGLE ifndef BUILD_SINGLE
BUILD_SINGLE = 0 BUILD_SINGLE = 0
@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME)
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB) -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
$(LIBPREFIX).def : gensymbol $(LIBPREFIX).def : gensymbol
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
libgoto_hpl.def : gensymbol libgoto_hpl.def : gensymbol
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
ifeq ($(OSNAME), Darwin) ifeq ($(OSNAME), Darwin)
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
@ -258,23 +258,23 @@ static : ../$(LIBNAME)
rm -f goto.$(SUFFIX) rm -f goto.$(SUFFIX)
osx.def : gensymbol ../Makefile.system ../getarch.c osx.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
aix.def : gensymbol ../Makefile.system ../getarch.c aix.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
objcopy.def : gensymbol ../Makefile.system ../getarch.c objcopy.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
objconv.def : gensymbol ../Makefile.system ../getarch.c objconv.def : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F) perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
test : linktest.c test : linktest.c
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK. $(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
rm -f linktest rm -f linktest
linktest.c : gensymbol ../Makefile.system ../getarch.c linktest.c : gensymbol ../Makefile.system ../getarch.c
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
clean :: clean ::
@rm -f *.def *.dylib __.SYMDEF* *.renamed @rm -f *.def *.dylib __.SYMDEF* *.renamed

View File

@ -51,7 +51,7 @@
zgeadd, dzsum); zgeadd, dzsum);
@cblasobjs = (lsame, xerbla); @cblasobjs = (lsame, xerbla);
@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod); @halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = ( @cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@ -94,7 +94,7 @@
@cblasobjs = ( cblas_xerbla ); @cblasobjs = ( cblas_xerbla );
@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod); @halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@exblasobjs = ( @exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,

View File

@ -9,8 +9,8 @@
int main(int argc, char **argv) { int main(int argc, char **argv) {
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) { if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M); printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M);
printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N); printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N);
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M); printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N); printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M); printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);

View File

@ -46,10 +46,10 @@ SBLAS3OBJS = \
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\ somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
sgeadd.$(SUFFIX) sgeadd.$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLAS1OBJS = shdot.$(SUFFIX) SBBLAS1OBJS = sbdot.$(SUFFIX)
SHBLAS3OBJS = shgemm.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX)
SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif endif
DBLAS1OBJS = \ DBLAS1OBJS = \
@ -282,10 +282,10 @@ CSBLAS3OBJS = \
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\ cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
CSHBLAS1OBJS = cblas_shdot.$(SUFFIX) CBHBLAS1OBJS = cblas_sbdot.$(SUFFIX)
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) CBHBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) CBHEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif endif
CDBLAS1OBJS = \ CDBLAS1OBJS = \
@ -381,8 +381,8 @@ override CFLAGS += -I.
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS2OBJS += $(CSBLAS2OBJS)
SBLAS3OBJS += $(CSBLAS3OBJS) SBLAS3OBJS += $(CSBLAS3OBJS)
SHBLAS1OBJS += $(CSHBLAS1OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS)
SHBLAS3OBJS += $(CSHBLAS3OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS)
DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS1OBJS += $(CDBLAS1OBJS)
DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS2OBJS += $(CDBLAS2OBJS)
DBLAS3OBJS += $(CDBLAS3OBJS) DBLAS3OBJS += $(CDBLAS3OBJS)
@ -393,13 +393,13 @@ ZBLAS1OBJS += $(CZBLAS1OBJS)
ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS)
ZBLAS3OBJS += $(CZBLAS3OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS)
SHEXTOBJS += $(CSHEXTOBJS) SBEXTOBJS += $(CSBEXTOBJS)
CBAUXOBJS += $(CXERBLAOBJ) CBAUXOBJS += $(CXERBLAOBJ)
endif endif
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS) SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS)
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@ -506,7 +506,7 @@ ifneq ($(BUILD_COMPLEX16),1)
ZBLASOBJS= ZBLASOBJS=
endif endif
FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
$(info FUNCOBJS = {[$(FUNCOBJS)]} ) $(info FUNCOBJS = {[$(FUNCOBJS)]} )
ifdef EXPRECISION ifdef EXPRECISION
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
@ -772,8 +772,8 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c
dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) $(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
@ -1278,8 +1278,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c
xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -c $(CFLAGS) $< -o $(@F) $(CC) -c $(CFLAGS) $< -o $(@F)
endif endif
@ -1523,8 +1523,8 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
@ -1857,8 +1857,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c
cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F) $(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif endif

View File

@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
# a bit of metaprogramming here to pull out the appropriate KERNEL var # a bit of metaprogramming here to pull out the appropriate KERNEL var
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SH") set (float_char "SB")
endif () endif ()
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type})
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type})
@ -149,8 +149,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3) GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SH") set (float_char "SB")
endif () endif ()
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX") if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
@ -208,13 +208,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE) GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
endif() endif()
foreach (float_type SINGLE DOUBLE HALF) foreach (float_type SINGLE DOUBLE BFLOAT16)
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
if (NOT ${BUILD_HALF}) if (NOT ${BUILD_BFLOAT16})
continue () continue ()
else () else ()
set (float_char "SH") set (float_char "SB")
endif () endif ()
endif () endif ()
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
@ -254,8 +254,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SH") set (float_char "SB")
endif () endif ()
if (${float_char}GEMMINCOPY) if (${float_char}GEMMINCOPY)
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type}) GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
@ -620,8 +620,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
# Makefile.LA # Makefile.LA
if(NOT NO_LAPACK) if(NOT NO_LAPACK)
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SH") set (float_char "SB")
endif () endif ()
if (NOT DEFINED ${float_char}NEG_TCOPY) if (NOT DEFINED ${float_char}NEG_TCOPY)
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X") if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
@ -688,8 +688,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
foreach (float_type ${FLOAT_TYPES}) foreach (float_type ${FLOAT_TYPES})
# a bit of metaprogramming here to pull out the appropriate KERNEL var # a bit of metaprogramming here to pull out the appropriate KERNEL var
string(SUBSTRING ${float_type} 0 1 float_char) string(SUBSTRING ${float_type} 0 1 float_char)
if (${float_type} STREQUAL "HALF") if (${float_type} STREQUAL "BFLOAT16")
set (float_char "SH") set (float_char "SB")
endif () endif ()
GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type}) GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type})
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type}) GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type})

View File

@ -262,9 +262,9 @@ ifndef XDOTKERNEL
XDOTKERNEL = zdot.S XDOTKERNEL = zdot.S
endif endif
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
ifndef SHDOTKERNEL ifndef SBDOTKERNEL
SHDOTKERNEL = ../x86_64/shdot.c SBDOTKERNEL = ../x86_64/sbdot.c
endif endif
ifndef TOBF16KERNEL ifndef TOBF16KERNEL
@ -530,11 +530,11 @@ XBLASOBJS += \
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLASOBJS += \ SHBLASOBJS += \
shdot_k$(TSUFFIX).$(SUFFIX) sbdot_k$(TSUFFIX).$(SUFFIX)
SHEXTOBJS += \ SHEXTOBJS += \
shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX) sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX)
SHEXTOBJS += \ SHEXTOBJS += \
sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX) sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX)
endif endif
@ -757,12 +757,12 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL) $(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL) $(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@ $(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL) $(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@ $(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@

View File

@ -80,24 +80,24 @@ SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
endif endif
endif endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
ifndef SHGEMMKERNEL ifndef SBGEMMKERNEL
SHGEMM_BETA = ../generic/gemm_beta.c SBGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c SBGEMMKERNEL = ../generic/gemmkernel_2x2.c
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c SBGEMMINCOPY = ../generic/gemm_ncopy_2.c
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c SBGEMMITCOPY = ../generic/gemm_tcopy_2.c
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c SBGEMMONCOPY = ../generic/gemm_ncopy_2.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
endif endif
SHKERNELOBJS += \ SHKERNELOBJS += \
shgemm_kernel$(TSUFFIX).$(SUFFIX) \ sbgemm_kernel$(TSUFFIX).$(SUFFIX) \
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \ $(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ) $(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
endif endif
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" "" ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
@ -149,7 +149,7 @@ XKERNELOBJS += \
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \ $(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ) $(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLASOBJS += $(SHKERNELOBJS) SHBLASOBJS += $(SHKERNELOBJS)
endif endif
SBLASOBJS += $(SKERNELOBJS) SBLASOBJS += $(SKERNELOBJS)
@ -159,8 +159,8 @@ CBLASOBJS += $(CKERNELOBJS)
ZBLASOBJS += $(ZKERNELOBJS) ZBLASOBJS += $(ZKERNELOBJS)
XBLASOBJS += $(XKERNELOBJS) XBLASOBJS += $(XKERNELOBJS)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX) SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
endif endif
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" "" ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
@ -492,11 +492,11 @@ ZBLASOBJS += \
zgeadd_k$(TSUFFIX).$(SUFFIX) zgeadd_k$(TSUFFIX).$(SUFFIX)
endif endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
endif endif
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
@ -524,9 +524,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX)) XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
@ -548,35 +548,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY) $(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY) $(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
ifeq ($(OS), AIX) ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
m4 shgemmotcopy.s > shgemmotcopy_nomacros.s m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
rm shgemmotcopy.s shgemmotcopy_nomacros.s rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
else else
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY) $(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY) $(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
ifeq ($(OS), AIX) ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
m4 shgemmitcopy.s > shgemmitcopy_nomacros.s m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
rm shgemmitcopy.s shgemmitcopy_nomacros.s rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
else else
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
endif endif
@ -746,16 +746,16 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
endif endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) $(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
ifeq ($(OS), AIX) ifeq ($(OS), AIX)
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s $(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s
m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
else else
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
endif endif
@ -2375,9 +2375,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA) $(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA) $(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA) $(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
@ -2396,19 +2396,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY) $(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY) $(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N)) ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY) $(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY) $(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
endif endif
@ -2518,9 +2518,9 @@ endif
endif endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_BFLOAT16), 1)
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND) $(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@ $(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
endif endif
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND) $(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)

View File

@ -1,5 +1,5 @@
#include "common.h" #include "common.h"
#if defined(HALF) && defined(HALFCONVERSION) #if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
static float static float
bfloat16tof32 (bfloat16 f16) bfloat16tof32 (bfloat16 f16)
{ {

View File

@ -7,16 +7,16 @@ else
#CGEMM_BETA = ../generic/zgemm_beta.c #CGEMM_BETA = ../generic/zgemm_beta.c
#ZGEMM_BETA = ../generic/zgemm_beta.c #ZGEMM_BETA = ../generic/zgemm_beta.c
SHGEMM_BETA = ../generic/gemm_beta.c SBGEMM_BETA = ../generic/gemm_beta.c
SHGEMMKERNEL = shgemm_kernel_power10.c SBGEMMKERNEL = sbgemm_kernel_power10.c
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c SBGEMMINCOPY = ../generic/gemm_ncopy_16.c
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c SBGEMMITCOPY = ../generic/gemm_tcopy_16.c
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c SBGEMMONCOPY = ../generic/gemm_ncopy_8.c
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX) SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX) SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX) SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX) SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRMMKERNEL = sgemm_kernel_power10.c STRMMKERNEL = sgemm_kernel_power10.c
DTRMMKERNEL = dgemm_kernel_power10.c DTRMMKERNEL = dgemm_kernel_power10.c

View File

@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
#include "common.h" #include "common.h"
#include <altivec.h> #include <altivec.h>
#if defined(HALF) && defined(HALFCONVERSION) #if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
static float static float
bfloat16tof32 (bfloat16 f16) bfloat16tof32 (bfloat16 f16)
{ {
@ -131,7 +131,7 @@ vector char mask =
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory"); #define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
/************************************************************************************* /*************************************************************************************
* SHGEMM Kernel * SBGEMM Kernel
*************************************************************************************/ *************************************************************************************/
int int
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A, CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,

View File

@ -53,32 +53,32 @@ gotoblas_t TABLE_NAME = {
GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN, GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
0, 0, 0, 0, 0, 0,
SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N, SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N,
#ifdef SHGEMM_DEFAULT_UNROLL_MN #ifdef SBGEMM_DEFAULT_UNROLL_MN
SHGEMM_DEFAULT_UNROLL_MN, SBGEMM_DEFAULT_UNROLL_MN,
#else #else
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N),
#endif #endif
shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
samax_kTS, samin_kTS, smax_kTS, smin_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS,
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS, snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS,
dsdot_kTS, dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
sgemv_nTS, sgemv_tTS, sger_kTS, sgemv_nTS, sgemv_tTS, sger_kTS,
ssymv_LTS, ssymv_UTS, ssymv_LTS, ssymv_UTS,
shgemm_kernelTS, shgemm_betaTS, sbgemm_kernelTS, sbgemm_betaTS,
#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N #if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N
shgemm_incopyTS, shgemm_itcopyTS, sbgemm_incopyTS, sbgemm_itcopyTS,
#else #else
shgemm_oncopyTS, shgemm_otcopyTS, sbgemm_oncopyTS, sbgemm_otcopyTS,
#endif #endif
shgemm_oncopyTS, shgemm_otcopyTS, sbgemm_oncopyTS, sbgemm_otcopyTS,
strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS, strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N #if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
@ -830,8 +830,8 @@ gotoblas_t TABLE_NAME = {
#if (ARCH_ARM64) #if (ARCH_ARM64)
static void init_parameter(void) { static void init_parameter(void) {
#if (BUILD_HALF) #if (BUILD_BFLOAT16)
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
#endif #endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
@ -846,8 +846,8 @@ static void init_parameter(void) {
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#endif #endif
#if (BUILD_HALF) #if (BUILD_BFLOAT16)
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif #endif
#if BUILD_SINGLE == 1 #if BUILD_SINGLE == 1
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
@ -862,8 +862,8 @@ static void init_parameter(void) {
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
#endif #endif
#if (BUILD_HALF) #if (BUILD_BFLOAT16)
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif #endif
#if BUILD_SINGLE == 1 #if BUILD_SINGLE == 1
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
@ -936,16 +936,16 @@ static void init_parameter(void) {
#if (ARCH_POWER) #if (ARCH_POWER)
static void init_parameter(void) { static void init_parameter(void) {
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
#endif #endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif #endif
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
@ -953,8 +953,8 @@ static void init_parameter(void) {
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif #endif
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
@ -965,16 +965,16 @@ static void init_parameter(void) {
#if (ARCH_ZARCH) #if (ARCH_ZARCH)
static void init_parameter(void) { static void init_parameter(void) {
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
#endif #endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
#endif #endif
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R; TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
@ -982,8 +982,8 @@ static void init_parameter(void) {
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif #endif
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
@ -1124,10 +1124,10 @@ static void init_parameter(void) {
(void) l2; /* dirty trick to suppress unused variable warning for targets */ (void) l2; /* dirty trick to suppress unused variable warning for targets */
/* where the GEMM unrolling parameters do not depend on l2 */ /* where the GEMM unrolling parameters do not depend on l2 */
#ifdef BUILD_HALF #ifdef BUILD_BFLOAT16
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P; TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R; TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q; TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
#endif #endif
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1) #if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;

View File

@ -146,8 +146,8 @@ ifndef XDOTKERNEL
XDOTKERNEL = zdot.S XDOTKERNEL = zdot.S
endif endif
ifndef SHDOTKERNEL ifndef SBDOTKERNEL
SHDOTKERNEL = shdot.c SBDOTKERNEL = sbdot.c
endif endif
ifndef TOBF16KERNEL ifndef TOBF16KERNEL

View File

@ -28,16 +28,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "common.h" #include "common.h"
#if defined(COOPERLAKE) #if defined(COOPERLAKE)
#include "shdot_microk_cooperlake.c" #include "sbdot_microk_cooperlake.c"
#endif #endif
static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y) static float sbdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
{ {
float d = 0.0; float d = 0.0;
#ifdef HAVE_SHDOT_ACCL_KERNEL #ifdef HAVE_SBDOT_ACCL_KERNEL
if ((inc_x == 1) && (inc_y == 1)) { if ((inc_x == 1) && (inc_y == 1)) {
return shdot_accl_kernel(n, x, y); return sbdot_accl_kernel(n, x, y);
} }
#endif #endif
@ -56,11 +56,11 @@ static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y,
} }
#if defined(SMP) #if defined(SMP)
static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2, static int sbdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2,
bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y,
float *result, BLASLONG dummy3) float *result, BLASLONG dummy3)
{ {
*(float *)result = shdot_compute(n, x, inc_x, y, inc_y); *(float *)result = sbdot_compute(n, x, inc_x, y, inc_y);
return 0; return 0;
} }
@ -94,13 +94,13 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y
} }
if (nthreads <= 1) { if (nthreads <= 1) {
dot_result = shdot_compute(n, x, inc_x, y, inc_y); dot_result = sbdot_compute(n, x, inc_x, y, inc_y);
} else { } else {
char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2]; char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2];
int mode = BLAS_BFLOAT16 | BLAS_REAL; int mode = BLAS_BFLOAT16 | BLAS_REAL;
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, thread_result, 0, x, inc_x, y, inc_y, thread_result, 0,
(void *)shdot_thread_func, nthreads); (void *)sbdot_thread_func, nthreads);
float * ptr = (float *)thread_result; float * ptr = (float *)thread_result;
for (int i = 0; i < nthreads; i++) { for (int i = 0; i < nthreads; i++) {
dot_result += (*ptr); dot_result += (*ptr);
@ -108,7 +108,7 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y
} }
} }
#else #else
dot_result = shdot_compute(n, x, inc_x, y, inc_y); dot_result = sbdot_compute(n, x, inc_x, y, inc_y);
#endif #endif
return dot_result; return dot_result;

View File

@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* need a new enough GCC for avx512 support */ /* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) #if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_SHDOT_ACCL_KERNEL 1 #define HAVE_SBDOT_ACCL_KERNEL 1
#include "common.h" #include "common.h"
#include <immintrin.h> #include <immintrin.h>
static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y) static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
{ {
__m128 accum128 = _mm_setzero_ps(); __m128 accum128 = _mm_setzero_ps();
if (n> 127) { /* n range from 128 to inf. */ if (n> 127) { /* n range from 128 to inf. */

View File

@ -382,7 +382,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1; mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
#elif defined(HALF) #elif defined(HALF)
mode = BLAS_HALF | BLAS_REAL; mode = BLAS_HALF | BLAS_REAL;
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1; mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1;
#else #else
mode = BLAS_SINGLE | BLAS_REAL; mode = BLAS_SINGLE | BLAS_REAL;
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1; mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;

32
param.h
View File

@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef PARAM_H #ifndef PARAM_H
#define PARAM_H #define PARAM_H
#define SHGEMM_DEFAULT_UNROLL_N 4 #define SBGEMM_DEFAULT_UNROLL_N 4
#define SHGEMM_DEFAULT_UNROLL_M 8 #define SBGEMM_DEFAULT_UNROLL_M 8
#define SHGEMM_DEFAULT_UNROLL_MN 32 #define SBGEMM_DEFAULT_UNROLL_MN 32
#define SHGEMM_DEFAULT_P 256 #define SBGEMM_DEFAULT_P 256
#define SHGEMM_DEFAULT_R 256 #define SBGEMM_DEFAULT_R 256
#define SHGEMM_DEFAULT_Q 256 #define SBGEMM_DEFAULT_Q 256
#ifdef OPTERON #ifdef OPTERON
#define SNUMOPT 4 #define SNUMOPT 4
@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(POWER10) #if defined(POWER10)
#undef SHGEMM_DEFAULT_UNROLL_N #undef SBGEMM_DEFAULT_UNROLL_N
#undef SHGEMM_DEFAULT_UNROLL_M #undef SBGEMM_DEFAULT_UNROLL_M
#undef SHGEMM_DEFAULT_P #undef SBGEMM_DEFAULT_P
#undef SHGEMM_DEFAULT_R #undef SBGEMM_DEFAULT_R
#undef SHGEMM_DEFAULT_Q #undef SBGEMM_DEFAULT_Q
#define SHGEMM_DEFAULT_UNROLL_M 16 #define SBGEMM_DEFAULT_UNROLL_M 16
#define SHGEMM_DEFAULT_UNROLL_N 8 #define SBGEMM_DEFAULT_UNROLL_N 8
#define SHGEMM_DEFAULT_P 832 #define SBGEMM_DEFAULT_P 832
#define SHGEMM_DEFAULT_Q 1026 #define SBGEMM_DEFAULT_Q 1026
#define SHGEMM_DEFAULT_R 4096 #define SBGEMM_DEFAULT_R 4096
#endif #endif
#if defined(SPARC) && defined(V7) #if defined(SPARC) && defined(V7)

View File

@ -214,16 +214,16 @@ endif
#ifeq ($(BUILD_HALF),1) #ifeq ($(BUILD_BFLOAT16),1)
#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3 #level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3
#else #else
#level3 : sblat3 dblat3 cblat3 zblat3 #level3 : sblat3 dblat3 cblat3 zblat3
#endif #endif
ifndef CROSS ifndef CROSS
rm -f ?BLAT3.SUMM rm -f ?BLAT3.SUMM
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
endif endif
ifeq ($(BUILD_SINGLE),1) ifeq ($(BUILD_SINGLE),1)
@ -245,8 +245,8 @@ endif
ifdef SMP ifdef SMP
rm -f ?BLAT3.SUMM rm -f ?BLAT3.SUMM
ifeq ($(USE_OPENMP), 1) ifeq ($(USE_OPENMP), 1)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
endif endif
ifeq ($(BUILD_SINGLE),1) ifeq ($(BUILD_SINGLE),1)
@ -266,8 +266,8 @@ ifeq ($(BUILD_COMPLEX16),1)
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0 @$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
endif endif
else else
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0 @$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
endif endif
ifeq ($(BUILD_SINGLE),1) ifeq ($(BUILD_SINGLE),1)
@ -377,9 +377,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME)
$(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
endif endif
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_BFLOAT16),1)
test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME) test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
$(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB) $(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
endif endif
ifeq ($(BUILD_COMPLEX),1) ifeq ($(BUILD_COMPLEX),1)
@ -398,7 +398,7 @@ clean:
@rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \ @rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \
sblat1 dblat1 cblat1 zblat1 \ sblat1 dblat1 cblat1 zblat1 \
sblat2 dblat2 cblat2 zblat2 \ sblat2 dblat2 cblat2 zblat2 \
test_shgemm sblat3 dblat3 cblat3 zblat3 \ test_sbgemm sblat3 dblat3 cblat3 zblat3 \
sblat1p dblat1p cblat1p zblat1p \ sblat1p dblat1p cblat1p zblat1p \
sblat2p dblat2p cblat2p zblat2p \ sblat2p dblat2p cblat2p zblat2p \
sblat3p dblat3p cblat3p zblat3p \ sblat3p dblat3p cblat3p zblat3p \

View File

@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <stdint.h> #include <stdint.h>
#include "../common.h" #include "../common.h"
#define SGEMM BLASFUNC(sgemm) #define SGEMM BLASFUNC(sgemm)
#define SHGEMM BLASFUNC(shgemm) #define SBGEMM BLASFUNC(sbgemm)
typedef union typedef union
{ {
unsigned short v; unsigned short v;
@ -102,7 +102,7 @@ main (int argc, char *argv[])
} }
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A, SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
&m, B, &k, &beta, C, &m); &m, B, &k, &beta, C, &m);
SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA, SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
&m, BB, &k, &beta, CC, &m); &m, BB, &k, &beta, CC, &m);
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
for (j = 0; j < m; j++) for (j = 0; j < m; j++)
@ -126,6 +126,6 @@ main (int argc, char *argv[])
} }
} }
if (ret != 0) if (ret != 0)
fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret); fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
return ret; return ret;
} }