Merge pull request #2886 from martin-frbg/issue_2767
Rename "HALF" precision functions (sh prefix) to "BFLOAT16" with "sb" prefix
This commit is contained in:
commit
08929430cd
|
@ -29,10 +29,8 @@ option(NO_AFFINITY "Disable support for CPU affinity masks to avoid binding proc
|
|||
else()
|
||||
set(NO_AFFINITY 1)
|
||||
endif()
|
||||
option(BUILD_SINGLE "Single precision" OFF)
|
||||
option(BUILD_DOUBLE "Double precision" OFF)
|
||||
option(BUILD_COMPLEX "Single precision" OFF)
|
||||
option(BUILD_COMPLEX16 "Single precision" OFF)
|
||||
option(CPP_THREAD_SAFETY_TEST "Run a massively parallel DGEMM test to confirm thread safety of the library (requires OpenMP and about 1.3GB of RAM)" OFF)
|
||||
option(CPP_THREAD_SAFETY_GEMV "Run a massively parallel DGEMV test to confirm thread safety of the library (requires OpenMP)" OFF)
|
||||
|
||||
# Add a prefix or suffix to all exported symbol names in the shared library.
|
||||
# Avoids conflicts with other BLAS libraries, especially when using
|
||||
|
@ -91,13 +89,13 @@ if (NOT NO_LAPACK)
|
|||
list(APPEND SUBDIRS lapack)
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED BUILD_HALF)
|
||||
set (BUILD_HALF false)
|
||||
if (NOT DEFINED BUILD_BFLOAT16)
|
||||
set (BUILD_BFLOAT16 false)
|
||||
endif ()
|
||||
# set which float types we want to build for
|
||||
if (NOT DEFINED BUILD_SINGLE AND NOT DEFINED BUILD_DOUBLE AND NOT DEFINED BUILD_COMPLEX AND NOT DEFINED BUILD_COMPLEX16)
|
||||
# if none are defined, build for all
|
||||
# set(BUILD_HALF true)
|
||||
# set(BUILD_BFLOAT16 true)
|
||||
set(BUILD_SINGLE true)
|
||||
set(BUILD_DOUBLE true)
|
||||
set(BUILD_COMPLEX true)
|
||||
|
@ -110,33 +108,28 @@ endif()
|
|||
|
||||
set(FLOAT_TYPES "")
|
||||
if (BUILD_SINGLE)
|
||||
message(STATUS "Building Songle Precision")
|
||||
list(APPEND FLOAT_TYPES "SINGLE")
|
||||
# set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
|
||||
message(STATUS "Building Single Precision")
|
||||
list(APPEND FLOAT_TYPES "SINGLE") # defines nothing
|
||||
endif ()
|
||||
|
||||
if (BUILD_DOUBLE)
|
||||
message(STATUS "Building Double Precision")
|
||||
list(APPEND FLOAT_TYPES "DOUBLE")
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1")
|
||||
list(APPEND FLOAT_TYPES "DOUBLE") # defines DOUBLE
|
||||
endif ()
|
||||
|
||||
if (BUILD_COMPLEX)
|
||||
message(STATUS "Building Complex Precision")
|
||||
list(APPEND FLOAT_TYPES "COMPLEX")
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1")
|
||||
list(APPEND FLOAT_TYPES "COMPLEX") # defines COMPLEX
|
||||
endif ()
|
||||
|
||||
if (BUILD_COMPLEX16)
|
||||
message(STATUS "Building Double Complex Precision")
|
||||
list(APPEND FLOAT_TYPES "ZCOMPLEX")
|
||||
#set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1")
|
||||
list(APPEND FLOAT_TYPES "ZCOMPLEX") # defines COMPLEX and DOUBLE
|
||||
endif ()
|
||||
|
||||
if (BUILD_HALF)
|
||||
if (BUILD_BFLOAT16)
|
||||
message(STATUS "Building Half Precision")
|
||||
list(APPEND FLOAT_TYPES "HALF")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_HALF")
|
||||
list(APPEND FLOAT_TYPES "BFLOAT16") # defines nothing
|
||||
endif ()
|
||||
|
||||
if (NOT DEFINED CORE OR "${CORE}" STREQUAL "UNKNOWN")
|
||||
|
@ -243,6 +236,9 @@ if (NOT MSVC AND NOT NOFORTRAN)
|
|||
add_subdirectory(ctest)
|
||||
endif()
|
||||
add_subdirectory(lapack-netlib/TESTING)
|
||||
if (CPP_THREAD_SAFETY_TEST OR CPP_THREAD_SAFETY_GEMV)
|
||||
add_subdirectory(cpp_thread_test)
|
||||
endif()
|
||||
endif()
|
||||
|
||||
set_target_properties(${OpenBLAS_LIBNAME} PROPERTIES
|
||||
|
|
|
@ -272,17 +272,33 @@ COMMON_PROF = -pg
|
|||
# work at all.
|
||||
#
|
||||
# CPP_THREAD_SAFETY_TEST = 1
|
||||
#
|
||||
# use this to run only the less memory-hungry GEMV test
|
||||
# CPP_THREAD_SAFETY_GEMV = 1
|
||||
|
||||
|
||||
# If you want to enable the experimental BFLOAT16 support
|
||||
# BUILD_HALF = 1
|
||||
#
|
||||
# Select if you need to build only select types
|
||||
# BUILD_SINGLE = 1
|
||||
# BUILD_DOUBLE = 1
|
||||
# BUILD_COMPLEX = 1
|
||||
# BUILD_COMPLEX16 = 1
|
||||
#
|
||||
#
|
||||
# BUILD_BFLOAT16 = 1
|
||||
|
||||
|
||||
# Set the thread number threshold beyond which the job array for the threaded level3 BLAS
|
||||
# will be allocated on the heap rather than the stack. (This array alone requires
|
||||
# NUM_THREADS*NUM_THREADS*128 bytes of memory so should not pose a problem at low cpu
|
||||
# counts, but obviously it is not the only item that ends up on the stack.)
|
||||
# The default value of 32 ensures that the overall requirement is compatible
|
||||
# with the default 1MB stacksize imposed by having the Java VM loaded without use
|
||||
# of its -Xss parameter.
|
||||
# The value of 160 formerly used from about version 0.2.7 until 0.3.10 is easily compatible
|
||||
# with the common Linux stacksize of 8MB but will cause crashes with unwary use of the java
|
||||
# VM e.g. in Octave or with the java-based libhdfs in numpy or scipy code
|
||||
# BLAS3_MEM_ALLOC_THRESHOLD = 160
|
||||
|
||||
|
||||
|
||||
# the below is not yet configurable, use cmake if you need to build only select types
|
||||
BUILD_SINGLE = 1
|
||||
BUILD_DOUBLE = 1
|
||||
BUILD_COMPLEX = 1
|
||||
BUILD_COMPLEX16 = 1
|
||||
# End of user configuration
|
||||
#
|
||||
|
|
|
@ -1232,8 +1232,8 @@ ifeq ($(USE_TLS), 1)
|
|||
CCOMMON_OPT += -DUSE_TLS
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
CCOMMON_OPT += -DBUILD_HALF
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
CCOMMON_OPT += -DBUILD_BFLOAT16
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE), 1)
|
||||
CCOMMON_OPT += -DBUILD_SINGLE=1
|
||||
|
@ -1521,10 +1521,10 @@ export KERNELDIR
|
|||
export FUNCTION_PROFILE
|
||||
export TARGET_CORE
|
||||
export NO_AVX512
|
||||
export BUILD_HALF
|
||||
export BUILD_BFLOAT16
|
||||
|
||||
export SHGEMM_UNROLL_M
|
||||
export SHGEMM_UNROLL_N
|
||||
export SBGEMM_UNROLL_M
|
||||
export SBGEMM_UNROLL_N
|
||||
export SGEMM_UNROLL_M
|
||||
export SGEMM_UNROLL_N
|
||||
export DGEMM_UNROLL_M
|
||||
|
|
|
@ -24,14 +24,14 @@ BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
|||
BLASOBJS_P += $(QBLASOBJS_P) $(XBLASOBJS_P)
|
||||
endif
|
||||
|
||||
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
|
||||
$(SHBLASOBJS) $(SHBLASOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
|
||||
$(SBLASOBJS) $(SBLASOBJS_P) : override CFLAGS += -UDOUBLE -UCOMPLEX
|
||||
$(DBLASOBJS) $(DBLASOBJS_P) : override CFLAGS += -DDOUBLE -UCOMPLEX
|
||||
$(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
|
||||
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
|
||||
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
|
||||
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
|
||||
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
|
||||
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DBFLOAT16 -UDOUBLE -UCOMPLEX
|
||||
|
||||
$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
|
||||
|
|
|
@ -49,8 +49,8 @@ else
|
|||
GOTO_LAPACK_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
GOTO_HALF_TARGETS=shgemm.goto
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
GOTO_HALF_TARGETS=sbgemm.goto
|
||||
else
|
||||
GOTO_HALF_TARGETS=
|
||||
endif
|
||||
|
@ -620,8 +620,8 @@ zcholesky.essl : zcholesky.$(SUFFIX)
|
|||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sgemm ####################################################
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.goto : sbgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
endif
|
||||
|
||||
|
@ -2927,8 +2927,8 @@ ccholesky.$(SUFFIX) : cholesky.c
|
|||
zcholesky.$(SUFFIX) : cholesky.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.$(SUFFIX) : gemm.c
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.$(SUFFIX) : gemm.c
|
||||
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
endif
|
||||
|
||||
|
|
|
@ -40,7 +40,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifdef DOUBLE
|
||||
#define GEMM BLASFUNC(dgemm)
|
||||
#elif defined(HALF)
|
||||
#define GEMM BLASFUNC(shgemm)
|
||||
#define GEMM BLASFUNC(sbgemm)
|
||||
#else
|
||||
#define GEMM BLASFUNC(sgemm)
|
||||
#endif
|
||||
|
|
2
cblas.h
2
cblas.h
|
@ -392,7 +392,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
|
|||
/* convert BFLOAT16 array to double array */
|
||||
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
|
||||
/* dot product of BFLOAT16 input arrays, output as float */
|
||||
float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
|
||||
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
|
||||
|
||||
#ifdef __cplusplus
|
||||
}
|
||||
|
|
|
@ -113,7 +113,7 @@ macro(SetDefaultL1)
|
|||
set(ZSUMKERNEL zsum.S)
|
||||
set(QSUMKERNEL sum.S)
|
||||
set(XSUMKERNEL zsum.S)
|
||||
if (BUILD_HALF)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHAMINKERNEL ../arm/amin.c)
|
||||
set(SHAMAXKERNEL ../arm/amax.c)
|
||||
set(SHMAXKERNEL ../arm/max.c)
|
||||
|
@ -126,7 +126,7 @@ if (BUILD_HALF)
|
|||
set(SHAXPYKERNEL ../arm/axpy.c)
|
||||
set(SHAXPBYKERNEL ../arm/axpby.c)
|
||||
set(SHCOPYKERNEL ../arm/copy.c)
|
||||
set(SHDOTKERNEL ../x86_64/shdot.c)
|
||||
set(SBDOTKERNEL ../x86_64/sbdot.c)
|
||||
set(SHROTKERNEL ../arm/rot.c)
|
||||
set(SHSCALKERNEL ../arm/scal.c)
|
||||
set(SHNRM2KERNEL ../arm/nrm2.c)
|
||||
|
@ -183,9 +183,9 @@ macro(SetDefaultL2)
|
|||
set(XHEMV_L_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
|
||||
set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
|
||||
if (BUILD_HALF)
|
||||
set(SHGEMVNKERNEL ../arm/gemv_n.c)
|
||||
set(SHGEMVTKERNEL ../arm/gemv_t.c)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SBGEMVNKERNEL ../arm/gemv_n.c)
|
||||
set(SBGEMVTKERNEL ../arm/gemv_t.c)
|
||||
set(SHGERKERNEL ../generic/ger.c)
|
||||
endif ()
|
||||
endmacro ()
|
||||
|
@ -195,18 +195,18 @@ macro(SetDefaultL3)
|
|||
set(DGEADD_KERNEL ../generic/geadd.c)
|
||||
set(CGEADD_KERNEL ../generic/zgeadd.c)
|
||||
set(ZGEADD_KERNEL ../generic/zgeadd.c)
|
||||
if (BUILD_HALF)
|
||||
if (BUILD_BFLOAT16)
|
||||
set(SHGEADD_KERNEL ../generic/geadd.c)
|
||||
set(SHGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
set(SHGEMM_BETA ../generic/gemm_beta.c)
|
||||
set(SHGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SHGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SHGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SHGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SHGEMMINCOPYOBJ shgemm_incopy.o)
|
||||
set(SHGEMMITCOPYOBJ shgemm_itcopy.o)
|
||||
set(SHGEMMONCOPYOBJ shgemm_oncopy.o)
|
||||
set(SHGEMMOTCOPYOBJ shgemm_otcopy.o)
|
||||
set(SBGEMMKERNEL ../generic/gemmkernel_2x2.c)
|
||||
set(SBGEMM_BETA ../generic/gemm_beta.c)
|
||||
set(SBGEMMINCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMITCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMONCOPY ../generic/gemm_ncopy_2.c)
|
||||
set(SBGEMMOTCOPY ../generic/gemm_tcopy_2.c)
|
||||
set(SBGEMMINCOPYOBJ sbgemm_incopy.o)
|
||||
set(SBGEMMITCOPYOBJ sbgemm_itcopy.o)
|
||||
set(SBGEMMONCOPYOBJ sbgemm_oncopy.o)
|
||||
set(SBGEMMOTCOPYOBJ sbgemm_otcopy.o)
|
||||
endif ()
|
||||
|
||||
endmacro ()
|
||||
|
|
|
@ -16,8 +16,8 @@
|
|||
# HAVE_SSE2
|
||||
# HAVE_SSE3
|
||||
# MAKE
|
||||
# SHGEMM_UNROLL_M
|
||||
# SHGEMM_UNROLL_N
|
||||
# SBGEMM_UNROLL_M
|
||||
# SBGEMM_UNROLL_N
|
||||
# SGEMM_UNROLL_M
|
||||
# SGEMM_UNROLL_N
|
||||
# DGEMM_UNROLL_M
|
||||
|
@ -471,8 +471,8 @@ endif ()
|
|||
set(ZGEMM_UNROLL_N 2)
|
||||
set(SYMV_P 8)
|
||||
endif()
|
||||
set(SHGEMM_UNROLL_M 8)
|
||||
set(SHGEMM_UNROLL_N 4)
|
||||
set(SBGEMM_UNROLL_M 8)
|
||||
set(SBGEMM_UNROLL_N 4)
|
||||
|
||||
# Or should this actually be NUM_CORES?
|
||||
if (${NUM_THREADS} GREATER 0)
|
||||
|
|
|
@ -326,7 +326,13 @@ else ()
|
|||
set(CCOMMON_OPT "${CCOMMON_OPT} -DMAX_STACK_ALLOC=2048")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
if (NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Windows")
|
||||
if (DEFINED BLAS3_MEM_ALLOC_THRESHOLD)
|
||||
if (NOT ${BLAS3_MEM_ALLOC_THRESHOLD} EQUAL 32)
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBLAS3_MEM_ALLOC_THRESHOLD=${BLAS3_MEM_ALLOC_THRESHOLD}")
|
||||
endif()
|
||||
endif()
|
||||
endif()
|
||||
if (DEFINED LIBNAMESUFFIX)
|
||||
set(LIBPREFIX "libopenblas_${LIBNAMESUFFIX}")
|
||||
else ()
|
||||
|
@ -404,20 +410,16 @@ if (NOT BUILD_SINGLE AND NOT BUILD_DOUBLE AND NOT BUILD_COMPLEX AND NOT BUILD_CO
|
|||
set (BUILD_COMPLEX16 ON)
|
||||
endif()
|
||||
if (BUILD_SINGLE)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE=1")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE")
|
||||
endif()
|
||||
if (BUILD_DOUBLE)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE=1")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_SINGLE=1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE")
|
||||
endif()
|
||||
if (BUILD_COMPLEX)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX=1")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX=1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX")
|
||||
endif()
|
||||
if (BUILD_COMPLEX16)
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16=1")
|
||||
set(CCOMMON_OPT "${CCOMMON_OPT} -DBUILD_COMPLEX16=1")
|
||||
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
|
||||
endif()
|
||||
if(NOT MSVC)
|
||||
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
|
||||
|
@ -591,8 +593,8 @@ endif ()
|
|||
#export FUNCTION_PROFILE
|
||||
#export TARGET_CORE
|
||||
#
|
||||
#export SHGEMM_UNROLL_M
|
||||
#export SHGEMM_UNROLL_N
|
||||
#export SBGEMM_UNROLL_M
|
||||
#export SBGEMM_UNROLL_N
|
||||
#export SGEMM_UNROLL_M
|
||||
#export SGEMM_UNROLL_N
|
||||
#export DGEMM_UNROLL_M
|
||||
|
|
|
@ -211,7 +211,7 @@ function(GenerateNamedObjects sources_in)
|
|||
if (complex_only)
|
||||
list(REMOVE_ITEM float_list "SINGLE")
|
||||
list(REMOVE_ITEM float_list "DOUBLE")
|
||||
list(REMOVE_ITEM float_list "HALF")
|
||||
list(REMOVE_ITEM float_list "BFLOAT16")
|
||||
elseif (real_only)
|
||||
list(REMOVE_ITEM float_list "COMPLEX")
|
||||
list(REMOVE_ITEM float_list "ZCOMPLEX")
|
||||
|
@ -225,8 +225,8 @@ function(GenerateNamedObjects sources_in)
|
|||
if (NOT no_float_type)
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
string(TOLOWER ${float_char} float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "sh")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "sb")
|
||||
endif ()
|
||||
endif ()
|
||||
|
||||
|
@ -262,8 +262,8 @@ function(GenerateNamedObjects sources_in)
|
|||
if (${float_type} STREQUAL "DOUBLE" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
list(APPEND obj_defines "DOUBLE")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
list(APPEND obj_defines "HALF")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
list(APPEND obj_defines "BFLOAT16")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
list(APPEND obj_defines "COMPLEX")
|
||||
|
|
4
common.h
4
common.h
|
@ -260,7 +260,7 @@ typedef unsigned long BLASULONG;
|
|||
#ifndef BFLOAT16
|
||||
#include <stdint.h>
|
||||
typedef uint16_t bfloat16;
|
||||
#define HALFCONVERSION 1
|
||||
#define BFLOAT16CONVERSION 1
|
||||
#endif
|
||||
|
||||
#ifdef USE64BITINT
|
||||
|
@ -303,7 +303,7 @@ typedef int blasint;
|
|||
#define SIZE 8
|
||||
#define BASE_SHIFT 3
|
||||
#define ZBASE_SHIFT 4
|
||||
#elif defined(HALF)
|
||||
#elif defined(BFLOAT16)
|
||||
#define IFLOAT bfloat16
|
||||
#define XFLOAT IFLOAT
|
||||
#define FLOAT float
|
||||
|
|
|
@ -54,7 +54,7 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *);
|
|||
double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *);
|
||||
xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
|
||||
|
||||
float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
|
||||
float BLASFUNC(sbdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
|
||||
void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *);
|
||||
void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *);
|
||||
void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *);
|
||||
|
@ -474,7 +474,7 @@ void BLASFUNC(xhbmv)(char *, blasint *, blasint *, xdouble *, xdouble *, blasint
|
|||
|
||||
/* Level 3 routines */
|
||||
|
||||
void BLASFUNC(shgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
||||
void BLASFUNC(sbgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
||||
bfloat16 *, blasint *, bfloat16 *, blasint *, float *, float *, blasint *);
|
||||
void BLASFUNC(sgemm)(char *, char *, blasint *, blasint *, blasint *, float *,
|
||||
float *, blasint *, float *, blasint *, float *, float *, blasint *);
|
||||
|
|
|
@ -46,7 +46,7 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
|||
double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
float sbdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
|
||||
void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
|
|
|
@ -55,7 +55,7 @@ void sgemm_direct(BLASLONG M, BLASLONG N, BLASLONG K,
|
|||
int sgemm_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
|
||||
|
||||
int shgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
int sbgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||
int sgemm_beta(BLASLONG, BLASLONG, BLASLONG, float,
|
||||
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -78,10 +78,10 @@ int xgemm_beta(BLASLONG, BLASLONG, BLASLONG, xdouble *,
|
|||
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
|
||||
#endif
|
||||
|
||||
int shgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int shgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int shgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int shgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int sbgemm_incopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int sbgemm_itcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int sbgemm_oncopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int sbgemm_otcopy(BLASLONG m, BLASLONG n, bfloat16 *a, BLASLONG lda, bfloat16 *b);
|
||||
int sgemm_incopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||
int sgemm_itcopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||
int sgemm_oncopy(BLASLONG m, BLASLONG n, float *a, BLASLONG lda, float *b);
|
||||
|
@ -505,7 +505,7 @@ int xher2k_kernel_UC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdoubl
|
|||
int xher2k_kernel_LN(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
||||
int xher2k_kernel_LC(BLASLONG m, BLASLONG n, BLASLONG k, xdouble alpha_r, xdouble alpha_i, xdouble *a, xdouble *b, xdouble *c, BLASLONG ldc, BLASLONG offset, int flag);
|
||||
|
||||
int shgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||
int sbgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||
int sgemm_kernel(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG);
|
||||
int dgemm_kernel(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
|
||||
|
||||
|
@ -534,10 +534,10 @@ int cgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float
|
|||
int zgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, double, double, double *, double *, double *, BLASLONG);
|
||||
int xgemm3m_kernel(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, xdouble *, xdouble *, xdouble *, BLASLONG);
|
||||
|
||||
int shgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
|
||||
int sgemm_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
int sgemm_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
|
@ -631,10 +631,10 @@ int xgemm_cr(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLON
|
|||
int xgemm_cc(blas_arg_t *, BLASLONG *, BLASLONG *, xdouble *, xdouble *, BLASLONG);
|
||||
#endif
|
||||
|
||||
int shgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int shgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_thread_tn(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
int sbgemm_thread_tt(blas_arg_t *, BLASLONG *, BLASLONG *, bfloat16 *, bfloat16 *, BLASLONG);
|
||||
|
||||
int sgemm_thread_nn(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
int sgemm_thread_nt(blas_arg_t *, BLASLONG *, BLASLONG *, float *, float *, BLASLONG);
|
||||
|
|
|
@ -39,7 +39,7 @@
|
|||
#ifndef COMMON_MACRO
|
||||
#define COMMON_MACRO
|
||||
|
||||
#include "common_sh.h"
|
||||
#include "common_sb.h"
|
||||
#include "common_s.h"
|
||||
#include "common_d.h"
|
||||
#include "common_q.h"
|
||||
|
@ -644,7 +644,7 @@
|
|||
|
||||
#define GEADD_K DGEADD_K
|
||||
|
||||
#elif defined(HALF)
|
||||
#elif defined(BFLOAT16)
|
||||
|
||||
#define D_TO_BF16_K SHDTOBF16_K
|
||||
#define D_BF16_TO_K DBF16TOD_K
|
||||
|
@ -662,7 +662,7 @@
|
|||
#define ASUM_K SASUM_K
|
||||
#define DOTU_K SDOTU_K
|
||||
#define DOTC_K SDOTC_K
|
||||
#define BF16_DOT_K SHDOT_K
|
||||
#define BF16_DOT_K SBDOT_K
|
||||
#define AXPYU_K SAXPYU_K
|
||||
#define AXPYC_K SAXPYC_K
|
||||
#define AXPBY_K SAXPBY_K
|
||||
|
@ -682,32 +682,32 @@
|
|||
#define NRM2_K SNRM2_K
|
||||
#define SYMV_THREAD_U SSYMV_THREAD_U
|
||||
#define SYMV_THREAD_L SSYMV_THREAD_L
|
||||
#define GEMM_BETA SHGEMM_BETA
|
||||
#define GEMM_KERNEL_N SHGEMM_KERNEL
|
||||
#define GEMM_KERNEL_L SHGEMM_KERNEL
|
||||
#define GEMM_KERNEL_R SHGEMM_KERNEL
|
||||
#define GEMM_KERNEL_B SHGEMM_KERNEL
|
||||
#define GEMM_BETA SBGEMM_BETA
|
||||
#define GEMM_KERNEL_N SBGEMM_KERNEL
|
||||
#define GEMM_KERNEL_L SBGEMM_KERNEL
|
||||
#define GEMM_KERNEL_R SBGEMM_KERNEL
|
||||
#define GEMM_KERNEL_B SBGEMM_KERNEL
|
||||
|
||||
#define GEMM_NN SHGEMM_NN
|
||||
#define GEMM_CN SHGEMM_TN
|
||||
#define GEMM_TN SHGEMM_TN
|
||||
#define GEMM_NC SHGEMM_NT
|
||||
#define GEMM_NT SHGEMM_NT
|
||||
#define GEMM_CC SHGEMM_TT
|
||||
#define GEMM_CT SHGEMM_TT
|
||||
#define GEMM_TC SHGEMM_TT
|
||||
#define GEMM_TT SHGEMM_TT
|
||||
#define GEMM_NR SHGEMM_NN
|
||||
#define GEMM_TR SHGEMM_TN
|
||||
#define GEMM_CR SHGEMM_TN
|
||||
#define GEMM_RN SHGEMM_NN
|
||||
#define GEMM_RT SHGEMM_NT
|
||||
#define GEMM_RC SHGEMM_NT
|
||||
#define GEMM_RR SHGEMM_NN
|
||||
#define GEMM_ONCOPY SHGEMM_ONCOPY
|
||||
#define GEMM_OTCOPY SHGEMM_OTCOPY
|
||||
#define GEMM_INCOPY SHGEMM_INCOPY
|
||||
#define GEMM_ITCOPY SHGEMM_ITCOPY
|
||||
#define GEMM_NN SBGEMM_NN
|
||||
#define GEMM_CN SBGEMM_TN
|
||||
#define GEMM_TN SBGEMM_TN
|
||||
#define GEMM_NC SBGEMM_NT
|
||||
#define GEMM_NT SBGEMM_NT
|
||||
#define GEMM_CC SBGEMM_TT
|
||||
#define GEMM_CT SBGEMM_TT
|
||||
#define GEMM_TC SBGEMM_TT
|
||||
#define GEMM_TT SBGEMM_TT
|
||||
#define GEMM_NR SBGEMM_NN
|
||||
#define GEMM_TR SBGEMM_TN
|
||||
#define GEMM_CR SBGEMM_TN
|
||||
#define GEMM_RN SBGEMM_NN
|
||||
#define GEMM_RT SBGEMM_NT
|
||||
#define GEMM_RC SBGEMM_NT
|
||||
#define GEMM_RR SBGEMM_NN
|
||||
#define GEMM_ONCOPY SBGEMM_ONCOPY
|
||||
#define GEMM_OTCOPY SBGEMM_OTCOPY
|
||||
#define GEMM_INCOPY SBGEMM_INCOPY
|
||||
#define GEMM_ITCOPY SBGEMM_ITCOPY
|
||||
#define SYMM_THREAD_LU SSYMM_THREAD_LU
|
||||
#define SYMM_THREAD_LL SSYMM_THREAD_LL
|
||||
#define SYMM_THREAD_RU SSYMM_THREAD_RU
|
||||
|
@ -723,22 +723,22 @@
|
|||
#define HEMM_THREAD_RU SHEMM_THREAD_RU
|
||||
#define HEMM_THREAD_RL SHEMM_THREAD_RL
|
||||
|
||||
#define GEMM_THREAD_NN SHGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_CN SHGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_TN SHGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_NC SHGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_NT SHGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_CC SHGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_CT SHGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_TC SHGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_TT SHGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_NR SHGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_TR SHGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_CR SHGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_RN SHGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_RT SHGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_RC SHGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_RR SHGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_NN SBGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_CN SBGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_TN SBGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_NC SBGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_NT SBGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_CC SBGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_CT SBGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_TC SBGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_TT SBGEMM_THREAD_TT
|
||||
#define GEMM_THREAD_NR SBGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_TR SBGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_CR SBGEMM_THREAD_TN
|
||||
#define GEMM_THREAD_RN SBGEMM_THREAD_NN
|
||||
#define GEMM_THREAD_RT SBGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_RC SBGEMM_THREAD_NT
|
||||
#define GEMM_THREAD_RR SBGEMM_THREAD_NN
|
||||
|
||||
#ifdef UNIT
|
||||
|
||||
|
@ -2491,9 +2491,9 @@
|
|||
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
|
||||
extern BLASLONG gemm_offset_a;
|
||||
extern BLASLONG gemm_offset_b;
|
||||
extern BLASLONG shgemm_p;
|
||||
extern BLASLONG shgemm_q;
|
||||
extern BLASLONG shgemm_r;
|
||||
extern BLASLONG sbgemm_p;
|
||||
extern BLASLONG sbgemm_q;
|
||||
extern BLASLONG sbgemm_r;
|
||||
extern BLASLONG sgemm_p;
|
||||
extern BLASLONG sgemm_q;
|
||||
extern BLASLONG sgemm_r;
|
||||
|
|
364
common_param.h
364
common_param.h
|
@ -47,114 +47,114 @@ typedef struct {
|
|||
int dtb_entries;
|
||||
int offsetA, offsetB, align;
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
int shgemm_p, shgemm_q, shgemm_r;
|
||||
int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||
|
||||
void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||
void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG);
|
||||
|
||||
float (*shamax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*shamin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*shmax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*shmin_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ishamax_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ishamin_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ishmax_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbamax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbamin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbmax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbmin_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*isbamax_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*isbamin_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*isbmax_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
|
||||
|
||||
float (*shnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*shasum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*shsum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sbnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbasum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sbsum_k) (BLASLONG, float *, BLASLONG);
|
||||
int (*sbcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sbdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
|
||||
double (*dsbdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
int (*sbrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
|
||||
int (*shaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*shscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*shswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*sbaxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*shgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*shgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*shger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
int (*shsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*shsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sbsymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
int (*shgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||
int (*shgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||
int (*sbgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, bfloat16 *, float *, BLASLONG);
|
||||
int (*sbgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*shgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*shgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*shgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*shgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*sbgemm_incopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*sbgemm_itcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*sbgemm_oncopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
int (*sbgemm_otcopy )(BLASLONG, BLASLONG, bfloat16 *, BLASLONG, bfloat16 *);
|
||||
|
||||
int (*shtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrsm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
||||
int (*shtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
|
||||
int (*shtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*shtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*sbtrmm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
||||
int (*shtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iunucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iunncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iutucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iutncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_ilnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_ilnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_iltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_ounucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_ounncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_outucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_outncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_olnucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbtrmm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
|
||||
int (*shsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*shsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbsymm_iutcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbsymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbsymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*sbsymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
|
||||
int (*shneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*shlaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
int (*sbneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sblaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
int sgemm_p, sgemm_q, sgemm_r;
|
||||
int sgemm_unroll_m, sgemm_unroll_n, sgemm_unroll_mn;
|
||||
#endif
|
||||
|
||||
int exclusive_cache;
|
||||
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
float (*samax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*samin_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*smax_k) (BLASLONG, float *, BLASLONG);
|
||||
|
@ -167,11 +167,10 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
float (*snrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*sasum_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
float (*ssum_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
int (*scopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
float (*sdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
//double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -179,26 +178,20 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*srot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
|
||||
int (*saxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX) || (BUILD_COMPLEX16)
|
||||
int (*sscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
|
||||
int (*sswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*sgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*sger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
||||
int (*ssymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*ssymv_U) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
|
||||
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE) || (BUILD_COMPLEX)
|
||||
#if defined(BUILD_SINGLE) || defined(BUILD_COMPLEX)
|
||||
#ifdef ARCH_X86_64
|
||||
void (*sgemm_direct) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG , float *, BLASLONG , float * , BLASLONG);
|
||||
int (*sgemm_direct_performant) (BLASLONG M, BLASLONG N, BLASLONG K);
|
||||
|
@ -213,8 +206,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*sgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*sgemm_otcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE)
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*strsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
@ -236,8 +228,7 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*strsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*strsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*strsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
#endif
|
||||
#if BUILD_SINGLE
|
||||
|
||||
int (*strmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*strmm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
@ -264,18 +255,17 @@ BLASLONG (*ismin_k) (BLASLONG, float *, BLASLONG);
|
|||
int (*ssymm_iltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*ssymm_outcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
int (*ssymm_oltcopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE)
|
||||
|
||||
int (*sneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*slaswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int dgemm_p, dgemm_q, dgemm_r;
|
||||
int dgemm_unroll_m, dgemm_unroll_n, dgemm_unroll_mn;
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
double (*damax_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*damin_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dmax_k) (BLASLONG, double *, BLASLONG);
|
||||
|
@ -286,21 +276,21 @@ BLASLONG (*idmax_k) (BLASLONG, double *, BLASLONG);
|
|||
BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
double (*dnrm2_k) (BLASLONG, double *, BLASLONG);
|
||||
double (*dasum_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
double (*dsum_k) (BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*dcopy_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
double (*ddot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_SINGLE) || (BUILD_DOUBLE)
|
||||
#if defined (BUILD_SINGLE) || defined(BUILD_DOUBLE)
|
||||
double (*dsdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*drot_k) (BLASLONG, double *, BLASLONG, double *, BLASLONG, double, double);
|
||||
int (*daxpy_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
int (*dscal_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
@ -308,15 +298,13 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemv_n) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dgemv_t) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dger_k) (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
|
||||
int (*dsymv_L) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dsymv_U) (BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE) || (BUILD_COMPLEX16)
|
||||
#if defined(BUILD_DOUBLE) || defined(BUILD_COMPLEX16)
|
||||
int (*dgemm_kernel )(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG);
|
||||
int (*dgemm_beta )(BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG);
|
||||
|
||||
|
@ -325,8 +313,7 @@ BLASLONG (*idmin_k) (BLASLONG, double *, BLASLONG);
|
|||
int (*dgemm_oncopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
int (*dgemm_otcopy )(BLASLONG, BLASLONG, double *, BLASLONG, double *);
|
||||
#endif
|
||||
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dtrsm_kernel_LN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_LT)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
int (*dtrsm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, double, double *, double *, double *, BLASLONG, BLASLONG);
|
||||
|
@ -473,30 +460,23 @@ BLASLONG (*iqmin_k) (BLASLONG, xdouble *, BLASLONG);
|
|||
|
||||
#endif
|
||||
|
||||
|
||||
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
|
||||
#ifdef BUILD_COMPLEX
|
||||
int cgemm_p, cgemm_q, cgemm_r;
|
||||
int cgemm_unroll_m, cgemm_unroll_n, cgemm_unroll_mn;
|
||||
|
||||
float (*camax_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*camin_k) (BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*icamax_k)(BLASLONG, float *, BLASLONG);
|
||||
BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX
|
||||
|
||||
float (*cnrm2_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*casum_k) (BLASLONG, float *, BLASLONG);
|
||||
float (*csum_k) (BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16)
|
||||
int (*ccopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotu_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
openblas_complex_float (*cdotc_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX
|
||||
int (*csrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX)|| (BUILD_COMPLEX16)
|
||||
|
||||
int (*caxpy_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*caxpyc_k)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
int (*cscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
@ -510,8 +490,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*cgemv_u) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemv_s) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemv_d) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX)
|
||||
int (*cgeru_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgerc_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgerv_k) (BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
|
@ -523,14 +501,13 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*chemv_U) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*chemv_M) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*chemv_V) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
|
||||
|
||||
int (*cgemm_kernel_n )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int (*cgemm_kernel_l )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int (*cgemm_kernel_r )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int (*cgemm_kernel_b )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG);
|
||||
int (*cgemm_beta )(BLASLONG, BLASLONG, BLASLONG, float, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
|
||||
|
||||
int (*cgemm_incopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemm_itcopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*cgemm_oncopy )(BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
|
@ -561,8 +538,6 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*ctrsm_olnncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*ctrsm_oltucopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
int (*ctrsm_oltncopy)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, float *);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX)
|
||||
|
||||
int (*ctrmm_kernel_RN)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
int (*ctrmm_kernel_RT)(BLASLONG, BLASLONG, BLASLONG, float, float, float *, float *, float *, BLASLONG, BLASLONG);
|
||||
|
@ -646,14 +621,12 @@ BLASLONG (*icamin_k)(BLASLONG, float *, BLASLONG);
|
|||
int (*chemm3m_olcopyr)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
|
||||
int (*chemm3m_oucopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
|
||||
int (*chemm3m_olcopyi)(BLASLONG, BLASLONG, float *, BLASLONG, BLASLONG, BLASLONG, float, float, float *);
|
||||
#endif
|
||||
#if (BUILD_COMPLEX) || (BUILD_COMPLEX16)
|
||||
|
||||
int (*cneg_tcopy) (BLASLONG, BLASLONG, float *, BLASLONG, float *);
|
||||
int (*claswp_ncopy) (BLASLONG, BLASLONG, BLASLONG, float *, BLASLONG, blasint *, float *);
|
||||
#endif
|
||||
|
||||
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int zgemm_p, zgemm_q, zgemm_r;
|
||||
int zgemm_unroll_m, zgemm_unroll_n, zgemm_unroll_mn;
|
||||
|
||||
|
@ -991,35 +964,34 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
void (*init)(void);
|
||||
|
||||
int snum_opt, dnum_opt, qnum_opt;
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*saxpby_k) (BLASLONG, float, float*, BLASLONG,float, float*, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*daxpby_k) (BLASLONG, double, double*, BLASLONG,double, double*, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*caxpby_k) (BLASLONG, float, float, float*, BLASLONG,float,float, float*, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zaxpby_k) (BLASLONG, double, double, double*, BLASLONG,double,double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*somatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*somatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*domatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*domatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_COMPLEX
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*comatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*comatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
int (*comatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
|
@ -1031,7 +1003,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*comatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zomatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*zomatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
int (*zomatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
|
@ -1043,21 +1015,21 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*zomatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*simatcopy_k_cn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_ct) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_rn) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
int (*simatcopy_k_rt) (BLASLONG, BLASLONG, float, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dimatcopy_k_cn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_ct) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_rn) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
int (*dimatcopy_k_rt) (BLASLONG, BLASLONG, double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_COMPLEX
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*cimatcopy_k_cn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
int (*cimatcopy_k_ct) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
int (*cimatcopy_k_rn) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
|
@ -1069,7 +1041,7 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*cimatcopy_k_rtc) (BLASLONG, BLASLONG, float, float, float*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zimatcopy_k_cn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
int (*zimatcopy_k_ct) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
int (*zimatcopy_k_rn) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
|
@ -1081,16 +1053,16 @@ BLASLONG (*ixamin_k)(BLASLONG, xdouble *, BLASLONG);
|
|||
int (*zimatcopy_k_rtc) (BLASLONG, BLASLONG, double, double, double*, BLASLONG);
|
||||
#endif
|
||||
|
||||
#if BUILD_SINGLE
|
||||
#ifdef BUILD_SINGLE
|
||||
int (*sgeadd_k) (BLASLONG, BLASLONG, float, float *, BLASLONG, float, float *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_DOUBLE
|
||||
#ifdef BUILD_DOUBLE
|
||||
int (*dgeadd_k) (BLASLONG, BLASLONG, double, double *, BLASLONG, double, double *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX
|
||||
#ifdef BUILD_COMPLEX
|
||||
int (*cgeadd_k) (BLASLONG, BLASLONG, float, float, float *, BLASLONG, float, float, float *, BLASLONG);
|
||||
#endif
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
int (*zgeadd_k) (BLASLONG, BLASLONG, double, double, double *, BLASLONG, double, double, double *, BLASLONG);
|
||||
#endif
|
||||
} gotoblas_t;
|
||||
|
@ -1104,16 +1076,16 @@ extern gotoblas_t *gotoblas;
|
|||
|
||||
#define HAVE_EX_L2 gotoblas -> exclusive_cache
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
#define SHGEMM_P gotoblas -> shgemm_p
|
||||
#define SHGEMM_Q gotoblas -> shgemm_q
|
||||
#define SHGEMM_R gotoblas -> shgemm_r
|
||||
#define SHGEMM_UNROLL_M gotoblas -> shgemm_unroll_m
|
||||
#define SHGEMM_UNROLL_N gotoblas -> shgemm_unroll_n
|
||||
#define SHGEMM_UNROLL_MN gotoblas -> shgemm_unroll_mn
|
||||
#ifdef BUILD_BFLOAT16
|
||||
#define SBGEMM_P gotoblas -> sbgemm_p
|
||||
#define SBGEMM_Q gotoblas -> sbgemm_q
|
||||
#define SBGEMM_R gotoblas -> sbgemm_r
|
||||
#define SBGEMM_UNROLL_M gotoblas -> sbgemm_unroll_m
|
||||
#define SBGEMM_UNROLL_N gotoblas -> sbgemm_unroll_n
|
||||
#define SBGEMM_UNROLL_MN gotoblas -> sbgemm_unroll_mn
|
||||
#endif
|
||||
|
||||
#if (BUILD_SINGLE)
|
||||
#if defined (BUILD_SINGLE)
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
#define SGEMM_R gotoblas -> sgemm_r
|
||||
|
@ -1122,21 +1094,13 @@ extern gotoblas_t *gotoblas;
|
|||
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
|
||||
#endif
|
||||
|
||||
#if (BUILD_DOUBLE)
|
||||
#if defined (BUILD_DOUBLE)
|
||||
#define DGEMM_P gotoblas -> dgemm_p
|
||||
#define DGEMM_Q gotoblas -> dgemm_q
|
||||
#define DGEMM_R gotoblas -> dgemm_r
|
||||
#define DGEMM_UNROLL_M gotoblas -> dgemm_unroll_m
|
||||
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
|
||||
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
|
||||
#if ! (BUILD_SINGLE)
|
||||
#define SGEMM_P gotoblas -> sgemm_p
|
||||
#define SGEMM_Q gotoblas -> sgemm_q
|
||||
#define SGEMM_R gotoblas -> sgemm_r
|
||||
#define SGEMM_UNROLL_M gotoblas -> sgemm_unroll_m
|
||||
#define SGEMM_UNROLL_N gotoblas -> sgemm_unroll_n
|
||||
#define SGEMM_UNROLL_MN gotoblas -> sgemm_unroll_mn
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define QGEMM_P gotoblas -> qgemm_p
|
||||
|
@ -1146,7 +1110,7 @@ extern gotoblas_t *gotoblas;
|
|||
#define QGEMM_UNROLL_N gotoblas -> qgemm_unroll_n
|
||||
#define QGEMM_UNROLL_MN gotoblas -> qgemm_unroll_mn
|
||||
|
||||
#if BUILD_COMPLEX
|
||||
#ifdef BUILD_COMPLEX
|
||||
#define CGEMM_P gotoblas -> cgemm_p
|
||||
#define CGEMM_Q gotoblas -> cgemm_q
|
||||
#define CGEMM_R gotoblas -> cgemm_r
|
||||
|
@ -1163,7 +1127,7 @@ extern gotoblas_t *gotoblas;
|
|||
#endif
|
||||
#endif
|
||||
|
||||
#if BUILD_COMPLEX16
|
||||
#ifdef BUILD_COMPLEX16
|
||||
#define ZGEMM_P gotoblas -> zgemm_p
|
||||
#define ZGEMM_Q gotoblas -> zgemm_q
|
||||
#define ZGEMM_R gotoblas -> zgemm_r
|
||||
|
@ -1178,14 +1142,6 @@ extern gotoblas_t *gotoblas;
|
|||
#define DGEMM_UNROLL_N gotoblas -> dgemm_unroll_n
|
||||
#define DGEMM_UNROLL_MN gotoblas -> dgemm_unroll_mn
|
||||
#endif
|
||||
#ifndef BUILD_COMPLEX
|
||||
#define CGEMM_P gotoblas -> cgemm_p
|
||||
#define CGEMM_Q gotoblas -> cgemm_q
|
||||
#define CGEMM_R gotoblas -> cgemm_r
|
||||
#define CGEMM_UNROLL_M gotoblas -> cgemm_unroll_m
|
||||
#define CGEMM_UNROLL_N gotoblas -> cgemm_unroll_n
|
||||
#define CGEMM_UNROLL_MN gotoblas -> cgemm_unroll_mn
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#define XGEMM_P gotoblas -> xgemm_p
|
||||
|
@ -1230,16 +1186,16 @@ extern gotoblas_t *gotoblas;
|
|||
#define HAVE_EX_L2 0
|
||||
#endif
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
#define SHGEMM_P SHGEMM_DEFAULT_P
|
||||
#define SHGEMM_Q SHGEMM_DEFAULT_Q
|
||||
#define SHGEMM_R SHGEMM_DEFAULT_R
|
||||
#define SHGEMM_UNROLL_M SHGEMM_DEFAULT_UNROLL_M
|
||||
#define SHGEMM_UNROLL_N SHGEMM_DEFAULT_UNROLL_N
|
||||
#ifdef SHGEMM_DEFAULT_UNROLL_MN
|
||||
#define SHGEMM_UNROLL_MN SHGEMM_DEFAULT_UNROLL_MN
|
||||
#ifdef BUILD_BFLOAT16
|
||||
#define SBGEMM_P SBGEMM_DEFAULT_P
|
||||
#define SBGEMM_Q SBGEMM_DEFAULT_Q
|
||||
#define SBGEMM_R SBGEMM_DEFAULT_R
|
||||
#define SBGEMM_UNROLL_M SBGEMM_DEFAULT_UNROLL_M
|
||||
#define SBGEMM_UNROLL_N SBGEMM_DEFAULT_UNROLL_N
|
||||
#ifdef SBGEMM_DEFAULT_UNROLL_MN
|
||||
#define SBGEMM_UNROLL_MN SBGEMM_DEFAULT_UNROLL_MN
|
||||
#else
|
||||
#define SHGEMM_UNROLL_MN MAX((SHGEMM_UNROLL_M), (SHGEMM_UNROLL_N))
|
||||
#define SBGEMM_UNROLL_MN MAX((SBGEMM_UNROLL_M), (SBGEMM_UNROLL_N))
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
@ -1354,7 +1310,7 @@ extern gotoblas_t *gotoblas;
|
|||
#endif
|
||||
|
||||
#ifndef COMPLEX
|
||||
#if (XDOUBLE)
|
||||
#if defined(XDOUBLE)
|
||||
#define GEMM_P QGEMM_P
|
||||
#define GEMM_Q QGEMM_Q
|
||||
#define GEMM_R QGEMM_R
|
||||
|
@ -1378,18 +1334,18 @@ extern gotoblas_t *gotoblas;
|
|||
#define GEMM_DEFAULT_R DGEMM_DEFAULT_R
|
||||
#define GEMM_DEFAULT_UNROLL_M DGEMM_DEFAULT_UNROLL_M
|
||||
#define GEMM_DEFAULT_UNROLL_N DGEMM_DEFAULT_UNROLL_N
|
||||
#elif (HALF)
|
||||
#define GEMM_P SHGEMM_P
|
||||
#define GEMM_Q SHGEMM_Q
|
||||
#define GEMM_R SHGEMM_R
|
||||
#define GEMM_UNROLL_M SHGEMM_UNROLL_M
|
||||
#define GEMM_UNROLL_N SHGEMM_UNROLL_N
|
||||
#define GEMM_UNROLL_MN SHGEMM_UNROLL_MN
|
||||
#define GEMM_DEFAULT_P SHGEMM_DEFAULT_P
|
||||
#define GEMM_DEFAULT_Q SHGEMM_DEFAULT_Q
|
||||
#define GEMM_DEFAULT_R SHGEMM_DEFAULT_R
|
||||
#define GEMM_DEFAULT_UNROLL_M SHGEMM_DEFAULT_UNROLL_M
|
||||
#define GEMM_DEFAULT_UNROLL_N SHGEMM_DEFAULT_UNROLL_N
|
||||
#elif defined(BFLOAT16)
|
||||
#define GEMM_P SBGEMM_P
|
||||
#define GEMM_Q SBGEMM_Q
|
||||
#define GEMM_R SBGEMM_R
|
||||
#define GEMM_UNROLL_M SBGEMM_UNROLL_M
|
||||
#define GEMM_UNROLL_N SBGEMM_UNROLL_N
|
||||
#define GEMM_UNROLL_MN SBGEMM_UNROLL_MN
|
||||
#define GEMM_DEFAULT_P SBGEMM_DEFAULT_P
|
||||
#define GEMM_DEFAULT_Q SBGEMM_DEFAULT_Q
|
||||
#define GEMM_DEFAULT_R SBGEMM_DEFAULT_R
|
||||
#define GEMM_DEFAULT_UNROLL_M SBGEMM_DEFAULT_UNROLL_M
|
||||
#define GEMM_DEFAULT_UNROLL_N SBGEMM_DEFAULT_UNROLL_N
|
||||
#else
|
||||
#define GEMM_P SGEMM_P
|
||||
#define GEMM_Q SGEMM_Q
|
||||
|
@ -1404,7 +1360,7 @@ extern gotoblas_t *gotoblas;
|
|||
#define GEMM_DEFAULT_UNROLL_N SGEMM_DEFAULT_UNROLL_N
|
||||
#endif
|
||||
#else
|
||||
#if (XDOUBLE)
|
||||
#if defined(XDOUBLE)
|
||||
#define GEMM_P XGEMM_P
|
||||
#define GEMM_Q XGEMM_Q
|
||||
#define GEMM_R XGEMM_R
|
||||
|
@ -1475,8 +1431,8 @@ extern gotoblas_t *gotoblas;
|
|||
#define GEMM_THREAD gemm_thread_n
|
||||
#endif
|
||||
|
||||
#ifndef SHGEMM_DEFAULT_R
|
||||
#define SHGEMM_DEFAULT_R (((BUFFER_SIZE - ((SHGEMM_DEFAULT_P * SHGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SHGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||
#ifndef SBGEMM_DEFAULT_R
|
||||
#define SBGEMM_DEFAULT_R (((BUFFER_SIZE - ((SBGEMM_DEFAULT_P * SBGEMM_DEFAULT_Q * 4 + GEMM_DEFAULT_OFFSET_A + GEMM_DEFAULT_ALIGN) & ~GEMM_DEFAULT_ALIGN)) / (SBGEMM_DEFAULT_Q * 4) - 15) & ~15UL)
|
||||
#endif
|
||||
|
||||
#ifndef SGEMM_DEFAULT_R
|
||||
|
@ -1518,7 +1474,7 @@ extern gotoblas_t *gotoblas;
|
|||
#ifndef GEMM3M_P
|
||||
#ifdef XDOUBLE
|
||||
#define GEMM3M_P XGEMM3M_P
|
||||
#elif defined (DOUBLE)
|
||||
#elif defined(DOUBLE)
|
||||
#define GEMM3M_P ZGEMM3M_P
|
||||
#else
|
||||
#define GEMM3M_P CGEMM3M_P
|
||||
|
@ -1528,7 +1484,7 @@ extern gotoblas_t *gotoblas;
|
|||
#ifndef GEMM3M_Q
|
||||
#ifdef XDOUBLE
|
||||
#define GEMM3M_Q XGEMM3M_Q
|
||||
#elif defined (DOUBLE)
|
||||
#elif defined(DOUBLE)
|
||||
#define GEMM3M_Q ZGEMM3M_Q
|
||||
#else
|
||||
#define GEMM3M_Q CGEMM3M_Q
|
||||
|
@ -1538,7 +1494,7 @@ extern gotoblas_t *gotoblas;
|
|||
#ifndef GEMM3M_R
|
||||
#ifdef XDOUBLE
|
||||
#define GEMM3M_R XGEMM3M_R
|
||||
#elif defined (DOUBLE)
|
||||
#elif defined(DOUBLE)
|
||||
#define GEMM3M_R ZGEMM3M_R
|
||||
#else
|
||||
#define GEMM3M_R CGEMM3M_R
|
||||
|
|
|
@ -0,0 +1,77 @@
|
|||
#ifndef COMMON_SB_H
|
||||
#define COMMON_SB_H
|
||||
|
||||
/* Kernel entry points for the bfloat16 ("sb" prefix) routines.
 * Without DYNAMIC_ARCH each upper-case macro expands to a plain function
 * name; with DYNAMIC_ARCH it expands to the member of the same name
 * reached through the `gotoblas` pointer. */
#ifndef DYNAMIC_ARCH
|
||||
|
||||
/* Dot product and conversion kernels.  Note that the last two keep the
 * sbf16tos / dbf16tod spelling rather than following the sb* prefix. */
#define SBDOT_K sbdot_k
|
||||
#define SBSTOBF16_K sbstobf16_k
|
||||
#define SBDTOBF16_K sbdtobf16_k
|
||||
#define SBF16TOS_K sbf16tos_k
|
||||
#define DBF16TOD_K dbf16tod_k
|
||||
|
||||
/* GEMM copy routines (the "o" variants are always distinct symbols). */
#define SBGEMM_ONCOPY sbgemm_oncopy
|
||||
#define SBGEMM_OTCOPY sbgemm_otcopy
|
||||
|
||||
/* When the default M and N unroll factors are equal, the "i" copy macros
 * are aliased to the "o" routines; otherwise they name separate
 * sbgemm_incopy / sbgemm_itcopy routines. */
#if SBGEMM_DEFAULT_UNROLL_M == SBGEMM_DEFAULT_UNROLL_N
|
||||
#define SBGEMM_INCOPY sbgemm_oncopy
|
||||
#define SBGEMM_ITCOPY sbgemm_otcopy
|
||||
#else
|
||||
#define SBGEMM_INCOPY sbgemm_incopy
|
||||
#define SBGEMM_ITCOPY sbgemm_itcopy
|
||||
#endif
|
||||
#define SBGEMM_BETA sbgemm_beta
|
||||
#define SBGEMM_KERNEL sbgemm_kernel
|
||||
|
||||
#else
|
||||
|
||||
/* DYNAMIC_ARCH: every kernel is looked up through `gotoblas`.  Unlike the
 * branch above, INCOPY/ITCOPY are never aliased to ONCOPY/OTCOPY here;
 * each macro refers to its own member. */
#define SBDOT_K gotoblas -> sbdot_k
|
||||
#define SBSTOBF16_K gotoblas -> sbstobf16_k
|
||||
#define SBDTOBF16_K gotoblas -> sbdtobf16_k
|
||||
#define SBF16TOS_K gotoblas -> sbf16tos_k
|
||||
#define DBF16TOD_K gotoblas -> dbf16tod_k
|
||||
|
||||
#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy
|
||||
#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy
|
||||
#define SBGEMM_INCOPY gotoblas -> sbgemm_incopy
|
||||
#define SBGEMM_ITCOPY gotoblas -> sbgemm_itcopy
|
||||
#define SBGEMM_BETA gotoblas -> sbgemm_beta
|
||||
#define SBGEMM_KERNEL gotoblas -> sbgemm_kernel
|
||||
|
||||
#endif
|
||||
|
||||
/* SBGEMM_<a><b> driver aliases.  All sixteen two-letter combinations
 * collapse onto four routines (nn, tn, nt, tt): in each position N and R
 * map to "n", while T and C map to "t".
 * NOTE(review): presumably C/R are the conjugated forms, which coincide
 * with T/N for a real data type - confirm against the complex headers. */
#define SBGEMM_NN sbgemm_nn
|
||||
#define SBGEMM_CN sbgemm_tn
|
||||
#define SBGEMM_TN sbgemm_tn
|
||||
#define SBGEMM_NC sbgemm_nt
|
||||
#define SBGEMM_NT sbgemm_nt
|
||||
#define SBGEMM_CC sbgemm_tt
|
||||
#define SBGEMM_CT sbgemm_tt
|
||||
#define SBGEMM_TC sbgemm_tt
|
||||
#define SBGEMM_TT sbgemm_tt
|
||||
#define SBGEMM_NR sbgemm_nn
|
||||
#define SBGEMM_TR sbgemm_tn
|
||||
#define SBGEMM_CR sbgemm_tn
|
||||
#define SBGEMM_RN sbgemm_nn
|
||||
#define SBGEMM_RT sbgemm_nt
|
||||
#define SBGEMM_RC sbgemm_nt
|
||||
#define SBGEMM_RR sbgemm_nn
|
||||
|
||||
/* SBGEMM_THREAD_<a><b> aliases for the sbgemm_thread_* routines.  The
 * mapping mirrors the non-threaded table: N and R select "n", T and C
 * select "t", so only sbgemm_thread_{nn,tn,nt,tt} are ever referenced. */
#define SBGEMM_THREAD_NN sbgemm_thread_nn
|
||||
#define SBGEMM_THREAD_CN sbgemm_thread_tn
|
||||
#define SBGEMM_THREAD_TN sbgemm_thread_tn
|
||||
#define SBGEMM_THREAD_NC sbgemm_thread_nt
|
||||
#define SBGEMM_THREAD_NT sbgemm_thread_nt
|
||||
#define SBGEMM_THREAD_CC sbgemm_thread_tt
|
||||
#define SBGEMM_THREAD_CT sbgemm_thread_tt
|
||||
#define SBGEMM_THREAD_TC sbgemm_thread_tt
|
||||
#define SBGEMM_THREAD_TT sbgemm_thread_tt
|
||||
#define SBGEMM_THREAD_NR sbgemm_thread_nn
|
||||
#define SBGEMM_THREAD_TR sbgemm_thread_tn
|
||||
#define SBGEMM_THREAD_CR sbgemm_thread_tn
|
||||
#define SBGEMM_THREAD_RN sbgemm_thread_nn
|
||||
#define SBGEMM_THREAD_RT sbgemm_thread_nt
|
||||
#define SBGEMM_THREAD_RC sbgemm_thread_nt
|
||||
#define SBGEMM_THREAD_RR sbgemm_thread_nn
|
||||
|
||||
#endif
|
||||
|
77
common_sh.h
77
common_sh.h
|
@ -1,77 +0,0 @@
|
|||
#ifndef COMMON_SH_H
|
||||
#define COMMON_SH_H
|
||||
|
||||
#ifndef DYNAMIC_ARCH
|
||||
|
||||
#define SHDOT_K shdot_k
|
||||
#define SHSTOBF16_K shstobf16_k
|
||||
#define SHDTOBF16_K shdtobf16_k
|
||||
#define SBF16TOS_K sbf16tos_k
|
||||
#define DBF16TOD_K dbf16tod_k
|
||||
|
||||
#define SHGEMM_ONCOPY shgemm_oncopy
|
||||
#define SHGEMM_OTCOPY shgemm_otcopy
|
||||
|
||||
#if SHGEMM_DEFAULT_UNROLL_M == SHGEMM_DEFAULT_UNROLL_N
|
||||
#define SHGEMM_INCOPY shgemm_oncopy
|
||||
#define SHGEMM_ITCOPY shgemm_otcopy
|
||||
#else
|
||||
#define SHGEMM_INCOPY shgemm_incopy
|
||||
#define SHGEMM_ITCOPY shgemm_itcopy
|
||||
#endif
|
||||
#define SHGEMM_BETA shgemm_beta
|
||||
#define SHGEMM_KERNEL shgemm_kernel
|
||||
|
||||
#else
|
||||
|
||||
#define SHDOT_K gotoblas -> shdot_k
|
||||
#define SHSTOBF16_K gotoblas -> shstobf16_k
|
||||
#define SHDTOBF16_K gotoblas -> shdtobf16_k
|
||||
#define SBF16TOS_K gotoblas -> sbf16tos_k
|
||||
#define DBF16TOD_K gotoblas -> dbf16tod_k
|
||||
|
||||
#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy
|
||||
#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy
|
||||
#define SHGEMM_INCOPY gotoblas -> shgemm_incopy
|
||||
#define SHGEMM_ITCOPY gotoblas -> shgemm_itcopy
|
||||
#define SHGEMM_BETA gotoblas -> shgemm_beta
|
||||
#define SHGEMM_KERNEL gotoblas -> shgemm_kernel
|
||||
|
||||
#endif
|
||||
|
||||
#define SHGEMM_NN shgemm_nn
|
||||
#define SHGEMM_CN shgemm_tn
|
||||
#define SHGEMM_TN shgemm_tn
|
||||
#define SHGEMM_NC shgemm_nt
|
||||
#define SHGEMM_NT shgemm_nt
|
||||
#define SHGEMM_CC shgemm_tt
|
||||
#define SHGEMM_CT shgemm_tt
|
||||
#define SHGEMM_TC shgemm_tt
|
||||
#define SHGEMM_TT shgemm_tt
|
||||
#define SHGEMM_NR shgemm_nn
|
||||
#define SHGEMM_TR shgemm_tn
|
||||
#define SHGEMM_CR shgemm_tn
|
||||
#define SHGEMM_RN shgemm_nn
|
||||
#define SHGEMM_RT shgemm_nt
|
||||
#define SHGEMM_RC shgemm_nt
|
||||
#define SHGEMM_RR shgemm_nn
|
||||
|
||||
#define SHGEMM_THREAD_NN shgemm_thread_nn
|
||||
#define SHGEMM_THREAD_CN shgemm_thread_tn
|
||||
#define SHGEMM_THREAD_TN shgemm_thread_tn
|
||||
#define SHGEMM_THREAD_NC shgemm_thread_nt
|
||||
#define SHGEMM_THREAD_NT shgemm_thread_nt
|
||||
#define SHGEMM_THREAD_CC shgemm_thread_tt
|
||||
#define SHGEMM_THREAD_CT shgemm_thread_tt
|
||||
#define SHGEMM_THREAD_TC shgemm_thread_tt
|
||||
#define SHGEMM_THREAD_TT shgemm_thread_tt
|
||||
#define SHGEMM_THREAD_NR shgemm_thread_nn
|
||||
#define SHGEMM_THREAD_TR shgemm_thread_tn
|
||||
#define SHGEMM_THREAD_CR shgemm_thread_tn
|
||||
#define SHGEMM_THREAD_RN shgemm_thread_nn
|
||||
#define SHGEMM_THREAD_RT shgemm_thread_nt
|
||||
#define SHGEMM_THREAD_RC shgemm_thread_nt
|
||||
#define SHGEMM_THREAD_RR shgemm_thread_nn
|
||||
|
||||
#endif
|
||||
|
|
@ -19,8 +19,8 @@ ifeq ($(ARCH), MIPS)
|
|||
USE_GEMM3M = 1
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_nn.$(SUFFIX) shgemm_nt.$(SUFFIX) shgemm_tn.$(SUFFIX) shgemm_tt.$(SUFFIX)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SHBLASOBJS += sbgemm_nn.$(SUFFIX) sbgemm_nt.$(SUFFIX) sbgemm_tn.$(SUFFIX) sbgemm_tt.$(SUFFIX)
|
||||
endif
|
||||
|
||||
SBLASOBJS += \
|
||||
|
@ -207,8 +207,8 @@ COMMONOBJS += gemm_thread_m.$(SUFFIX) gemm_thread_n.$(SUFFIX) gemm_thread_mn.$(
|
|||
COMMONOBJS += syrk_thread.$(SUFFIX)
|
||||
|
||||
ifndef USE_SIMPLE_THREADED_LEVEL3
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_thread_nn.$(SUFFIX) shgemm_thread_nt.$(SUFFIX) shgemm_thread_tn.$(SUFFIX) shgemm_thread_tt.$(SUFFIX)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SHBLASOBJS += sbgemm_thread_nn.$(SUFFIX) sbgemm_thread_nt.$(SUFFIX) sbgemm_thread_tn.$(SUFFIX) sbgemm_thread_tt.$(SUFFIX)
|
||||
endif
|
||||
SBLASOBJS += sgemm_thread_nn.$(SUFFIX) sgemm_thread_nt.$(SUFFIX) sgemm_thread_tn.$(SUFFIX) sgemm_thread_tt.$(SUFFIX)
|
||||
DBLASOBJS += dgemm_thread_nn.$(SUFFIX) dgemm_thread_nt.$(SUFFIX) dgemm_thread_tn.$(SUFFIX) dgemm_thread_tt.$(SUFFIX)
|
||||
|
@ -343,16 +343,16 @@ endif
|
|||
|
||||
all ::
|
||||
|
||||
shgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||
|
||||
shgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_nt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||
|
||||
shgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_tn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||
|
||||
shgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_tt.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||
|
||||
sgemm_nn.$(SUFFIX) : gemm.c level3.c ../../param.h
|
||||
|
@ -550,16 +550,16 @@ gemm_thread_variable.$(SUFFIX) : gemm_thread_variable.c ../../common.h
|
|||
beta_thread.$(SUFFIX) : beta_thread.c ../../common.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
shgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||
|
||||
shgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_nt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||
|
||||
shgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_tn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||
|
||||
shgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_tt.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(CFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||
|
||||
sgemm_thread_nn.$(SUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
|
@ -2735,16 +2735,16 @@ xtrsm_RCLU.$(SUFFIX) : trsm_R.c
|
|||
xtrsm_RCLN.$(SUFFIX) : trsm_R.c
|
||||
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DTRANSA -UUPPER -UUNIT -DCONJ $< -o $(@F)
|
||||
|
||||
shgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||
|
||||
shgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_nt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||
|
||||
shgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_tn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||
|
||||
shgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
sbgemm_tt.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||
|
||||
sgemm_nn.$(PSUFFIX) : gemm.c level3.c ../../param.h
|
||||
|
@ -2943,16 +2943,16 @@ beta_thread.$(PSUFFIX) : beta_thread.c ../../common.h
|
|||
$(CC) -c $(PFLAGS) $< -o $(@F)
|
||||
|
||||
|
||||
shgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNN $< -o $(@F)
|
||||
|
||||
shgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_nt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DNT $< -o $(@F)
|
||||
|
||||
shgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_tn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTN $< -o $(@F)
|
||||
|
||||
shgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
sbgemm_thread_tt.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
$(CC) $(PFLAGS) $(BLOCKS) -c -DTHREADED_LEVEL3 -DHALF -UDOUBLE -UCOMPLEX -DTT $< -o $(@F)
|
||||
|
||||
sgemm_thread_nn.$(PSUFFIX) : gemm.c level3_thread.c ../../param.h
|
||||
|
|
|
@ -227,7 +227,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
#ifdef BUILD_HALF
|
||||
#ifdef BUILD_BFLOAT16
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
|
||||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
|
|
|
@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
#ifdef BUILD_HALF
|
||||
#ifdef BUILD_BFLOAT16
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
|
||||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
|
|
|
@ -112,7 +112,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
|
|||
args -> a, args -> lda,
|
||||
args -> b, args -> ldb,
|
||||
args -> c, args -> ldc, sb);
|
||||
#ifdef BUILD_HALF
|
||||
#ifdef BUILD_BFLOAT16
|
||||
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
|
||||
/* REAL / BFLOAT16 */
|
||||
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
|
||||
|
|
|
@ -62,10 +62,10 @@ BLASLONG gemm_offset_b = DEFAULT_GEMM_OFFSET_B;
|
|||
BLASLONG gemm_offset_b = GEMM_OFFSET_B;
|
||||
#endif
|
||||
|
||||
#if SHGEMM_P == shgemm_p
|
||||
BLASLONG shgemm_p = DEFAULT_GEMM_P;
|
||||
#if SBGEMM_P == sbgemm_p
|
||||
BLASLONG sbgemm_p = DEFAULT_GEMM_P;
|
||||
#else
|
||||
BLASLONG shgemm_p = SHGEMM_P;
|
||||
BLASLONG sbgemm_p = SBGEMM_P;
|
||||
#endif
|
||||
#if SGEMM_P == sgemm_p
|
||||
BLASLONG sgemm_p = DEFAULT_GEMM_P;
|
||||
|
@ -88,10 +88,10 @@ BLASLONG zgemm_p = DEFAULT_GEMM_P;
|
|||
BLASLONG zgemm_p = ZGEMM_P;
|
||||
#endif
|
||||
|
||||
#if SHGEMM_Q == shgemm_q
|
||||
BLASLONG shgemm_q = DEFAULT_GEMM_Q;
|
||||
#if SBGEMM_Q == sbgemm_q
|
||||
BLASLONG sbgemm_q = DEFAULT_GEMM_Q;
|
||||
#else
|
||||
BLASLONG shgemm_q = SHGEMM_Q;
|
||||
BLASLONG sbgemm_q = SBGEMM_Q;
|
||||
#endif
|
||||
#if SGEMM_Q == sgemm_q
|
||||
BLASLONG sgemm_q = DEFAULT_GEMM_Q;
|
||||
|
@ -114,10 +114,10 @@ BLASLONG zgemm_q = DEFAULT_GEMM_Q;
|
|||
BLASLONG zgemm_q = ZGEMM_Q;
|
||||
#endif
|
||||
|
||||
#if SHGEMM_R == shgemm_r
|
||||
BLASLONG shgemm_r = DEFAULT_GEMM_R;
|
||||
#if SBGEMM_R == sbgemm_r
|
||||
BLASLONG sbgemm_r = DEFAULT_GEMM_R;
|
||||
#else
|
||||
BLASLONG shgemm_r = SHGEMM_R;
|
||||
BLASLONG sbgemm_r = SBGEMM_R;
|
||||
#endif
|
||||
#if SGEMM_R == sgemm_r
|
||||
BLASLONG sgemm_r = DEFAULT_GEMM_R;
|
||||
|
@ -615,7 +615,7 @@ void blas_set_parameter(void){
|
|||
|
||||
size = BITMASK(cpuid3, 16, 0xff);
|
||||
|
||||
shgemm_p = 192 * (size + 1);
|
||||
sbgemm_p = 192 * (size + 1);
|
||||
sgemm_p = 192 * (size + 1);
|
||||
dgemm_p = 96 * (size + 1);
|
||||
cgemm_p = 96 * (size + 1);
|
||||
|
@ -629,7 +629,7 @@ void blas_set_parameter(void){
|
|||
xgemm_p = 16 * (size + 1);
|
||||
#endif
|
||||
|
||||
shgemm_r = (((BUFFER_SIZE - ((SHGEMM_P * SHGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SHGEMM_Q * 4)) - 15) & ~15;
|
||||
sbgemm_r = (((BUFFER_SIZE - ((SBGEMM_P * SBGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SBGEMM_Q * 4)) - 15) & ~15;
|
||||
sgemm_r = (((BUFFER_SIZE - ((SGEMM_P * SGEMM_Q * 4 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (SGEMM_Q * 4)) - 15) & ~15;
|
||||
dgemm_r = (((BUFFER_SIZE - ((DGEMM_P * DGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (DGEMM_Q * 8)) - 15) & ~15;
|
||||
cgemm_r = (((BUFFER_SIZE - ((CGEMM_P * CGEMM_Q * 8 + GEMM_OFFSET_A + GEMM_ALIGN) & ~GEMM_ALIGN)) / (CGEMM_Q * 8)) - 15) & ~15;
|
||||
|
|
|
@ -30,8 +30,8 @@ ifndef BUILD_LAPACK_DEPRECATED
|
|||
BUILD_LAPACK_DEPRECATED = 0
|
||||
endif
|
||||
|
||||
ifndef BUILD_HALF
|
||||
BUILD_HALF = 0
|
||||
ifndef BUILD_BFLOAT16
|
||||
BUILD_BFLOAT16 = 0
|
||||
endif
|
||||
ifndef BUILD_SINGLE
|
||||
BUILD_SINGLE = 0
|
||||
|
@ -120,10 +120,10 @@ dll : ../$(LIBDLLNAME)
|
|||
-Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive $(FEXTRALIB) $(EXTRALIB)
|
||||
|
||||
$(LIBPREFIX).def : gensymbol
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol win2k $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
libgoto_hpl.def : gensymbol
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol win2khpl $(ARCH) dummy $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
ifeq ($(OSNAME), Darwin)
|
||||
INTERNALNAME = $(LIBPREFIX).$(MAJOR_VERSION).dylib
|
||||
|
@ -258,23 +258,23 @@ static : ../$(LIBNAME)
|
|||
rm -f goto.$(SUFFIX)
|
||||
|
||||
osx.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol osx $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
aix.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol aix $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
objcopy.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol objcopy $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
objconv.def : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
perl ./gensymbol objconv $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16)> $(@F)
|
||||
|
||||
test : linktest.c
|
||||
$(CC) $(CFLAGS) $(LDFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) -lm && echo OK.
|
||||
rm -f linktest
|
||||
|
||||
linktest.c : gensymbol ../Makefile.system ../getarch.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_HALF) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||
perl ./gensymbol linktest $(ARCH) $(BU) $(EXPRECISION) $(NO_CBLAS) $(NO_LAPACK) $(NO_LAPACKE) $(NEED2UNDERSCORES) $(ONLY_CBLAS) "$(SYMBOLPREFIX)" "$(SYMBOLSUFFIX)" $(BUILD_LAPACK_DEPRECATED) $(BUILD_BFLOAT16) $(BUILD_SINGLE) $(BUILD_DOUBLE) $(BUILD_COMPLEX) $(BUILD_COMPLEX16) > linktest.c
|
||||
|
||||
clean ::
|
||||
@rm -f *.def *.dylib __.SYMDEF* *.renamed
|
||||
|
|
|
@ -51,7 +51,7 @@
|
|||
zgeadd, dzsum);
|
||||
|
||||
@cblasobjs = (lsame, xerbla);
|
||||
@halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod);
|
||||
@halfblasobjs = (sbgemm, sbdot, shstobf16, shdtobf16, sbf16tos, dbf16tod);
|
||||
@cblasobjsc = (
|
||||
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
|
||||
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
|
||||
|
@ -94,7 +94,7 @@
|
|||
|
||||
@cblasobjs = ( cblas_xerbla );
|
||||
|
||||
@halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod);
|
||||
@halfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod);
|
||||
|
||||
@exblasobjs = (
|
||||
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
|
||||
|
|
|
@ -9,8 +9,8 @@
|
|||
int main(int argc, char **argv) {
|
||||
|
||||
if ( (argc <= 1) || ((argc >= 2) && (*argv[1] == '0'))) {
|
||||
printf("SHGEMM_UNROLL_M=%d\n", SHGEMM_DEFAULT_UNROLL_M);
|
||||
printf("SHGEMM_UNROLL_N=%d\n", SHGEMM_DEFAULT_UNROLL_N);
|
||||
printf("SBGEMM_UNROLL_M=%d\n", SBGEMM_DEFAULT_UNROLL_M);
|
||||
printf("SBGEMM_UNROLL_N=%d\n", SBGEMM_DEFAULT_UNROLL_N);
|
||||
printf("SGEMM_UNROLL_M=%d\n", SGEMM_DEFAULT_UNROLL_M);
|
||||
printf("SGEMM_UNROLL_N=%d\n", SGEMM_DEFAULT_UNROLL_N);
|
||||
printf("DGEMM_UNROLL_M=%d\n", DGEMM_DEFAULT_UNROLL_M);
|
||||
|
|
|
@ -46,10 +46,10 @@ SBLAS3OBJS = \
|
|||
somatcopy.$(SUFFIX) simatcopy.$(SUFFIX)\
|
||||
sgeadd.$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLAS1OBJS = shdot.$(SUFFIX)
|
||||
SHBLAS3OBJS = shgemm.$(SUFFIX)
|
||||
SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SBBLAS1OBJS = sbdot.$(SUFFIX)
|
||||
SBBLAS3OBJS = sbgemm.$(SUFFIX)
|
||||
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
|
||||
endif
|
||||
|
||||
DBLAS1OBJS = \
|
||||
|
@ -282,10 +282,10 @@ CSBLAS3OBJS = \
|
|||
cblas_ssyrk.$(SUFFIX) cblas_ssyr2k.$(SUFFIX) cblas_somatcopy.$(SUFFIX) cblas_simatcopy.$(SUFFIX)\
|
||||
cblas_sgeadd.$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
CSHBLAS1OBJS = cblas_shdot.$(SUFFIX)
|
||||
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
|
||||
CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
|
||||
# CBLAS wrappers for the bfloat16 routines.  The variable names must match
# the ones appended further down ($(CSBBLAS1OBJS), $(CSBBLAS3OBJS),
# $(CSBEXTOBJS)); otherwise these objects never reach SBBLAS1OBJS,
# SBBLAS3OBJS and SBEXTOBJS and are left out of the library.
ifeq ($(BUILD_BFLOAT16),1)
|
||||
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
|
||||
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
|
||||
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
|
||||
endif
|
||||
|
||||
CDBLAS1OBJS = \
|
||||
|
@ -381,8 +381,8 @@ override CFLAGS += -I.
|
|||
SBLAS1OBJS += $(CSBLAS1OBJS)
|
||||
SBLAS2OBJS += $(CSBLAS2OBJS)
|
||||
SBLAS3OBJS += $(CSBLAS3OBJS)
|
||||
SHBLAS1OBJS += $(CSHBLAS1OBJS)
|
||||
SHBLAS3OBJS += $(CSHBLAS3OBJS)
|
||||
SBBLAS1OBJS += $(CSBBLAS1OBJS)
|
||||
SBBLAS3OBJS += $(CSBBLAS3OBJS)
|
||||
DBLAS1OBJS += $(CDBLAS1OBJS)
|
||||
DBLAS2OBJS += $(CDBLAS2OBJS)
|
||||
DBLAS3OBJS += $(CDBLAS3OBJS)
|
||||
|
@ -393,13 +393,13 @@ ZBLAS1OBJS += $(CZBLAS1OBJS)
|
|||
ZBLAS2OBJS += $(CZBLAS2OBJS)
|
||||
ZBLAS3OBJS += $(CZBLAS3OBJS)
|
||||
|
||||
SHEXTOBJS += $(CSHEXTOBJS)
|
||||
SBEXTOBJS += $(CSBEXTOBJS)
|
||||
|
||||
CBAUXOBJS += $(CXERBLAOBJ)
|
||||
endif
|
||||
|
||||
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
|
||||
SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS)
|
||||
SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS)
|
||||
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
|
||||
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
|
||||
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
|
||||
|
@ -506,7 +506,7 @@ ifneq ($(BUILD_COMPLEX16),1)
|
|||
ZBLASOBJS=
|
||||
endif
|
||||
|
||||
FUNCOBJS = $(SHEXTOBJS) $(CXERBLAOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
||||
FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
|
||||
$(info FUNCOBJS = {[$(FUNCOBJS)]} )
|
||||
ifdef EXPRECISION
|
||||
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
|
||||
|
@ -772,8 +772,8 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c
|
|||
dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbdot.$(SUFFIX) sbdot.$(PSUFFIX) : bf16dot.c
|
||||
$(CC) $(CFLAGS) -c $< -o $(@F)
|
||||
shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c
|
||||
$(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
|
||||
|
@ -1278,8 +1278,8 @@ zhpr2.$(SUFFIX) zhpr2.$(PSUFFIX) : zhpr2.c
|
|||
xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.$(SUFFIX) shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
@ -1523,8 +1523,8 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
|
|||
cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbdot.$(SUFFIX) cblas_sbdot.$(PSUFFIX) : bf16dot.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
|
||||
cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c
|
||||
$(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
|
||||
|
@ -1857,8 +1857,8 @@ cblas_zhemv.$(SUFFIX) cblas_zhemv.$(PSUFFIX) : zhemv.c
|
|||
cblas_sgemm.$(SUFFIX) cblas_sgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
cblas_shgemm.$(SUFFIX) cblas_shgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
cblas_sbgemm.$(SUFFIX) cblas_sbgemm.$(PSUFFIX) : gemm.c ../param.h
|
||||
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
|
||||
endif
|
||||
|
||||
|
|
|
@ -41,8 +41,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
foreach (float_type ${FLOAT_TYPES})
|
||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMAXKERNEL}" "USE_ABS" "amax_k" false "" "" false ${float_type})
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}AMINKERNEL}" "USE_ABS;USE_MIN" "amin_k" false "" "" false ${float_type})
|
||||
|
@ -149,8 +149,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("generic/ger.c" "" "ger_k" false "" "" "" 3)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GERUKERNEL}" "" "geru_k" false "" "" false ${float_type})
|
||||
|
@ -208,13 +208,13 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
GenerateNamedObjects("${KERNELDIR}/${SGEMMDIRECTPERFORMANT}" "" "gemm_direct_performant" false "" "" false SINGLE)
|
||||
endif()
|
||||
|
||||
foreach (float_type SINGLE DOUBLE HALF)
|
||||
foreach (float_type SINGLE DOUBLE BFLOAT16)
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
if (NOT ${BUILD_HALF})
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
if (NOT ${BUILD_BFLOAT16})
|
||||
continue ()
|
||||
else ()
|
||||
set (float_char "SH")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
endif ()
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMKERNEL}" "" "gemm_kernel" false "" "" false ${float_type})
|
||||
|
@ -254,8 +254,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (${float_char}GEMMINCOPY)
|
||||
GenerateNamedObjects("${KERNELDIR}/${${float_char}GEMMINCOPY}" "${float_type}" "${${float_char}GEMMINCOPYOBJ}" false "" "" true ${float_type})
|
||||
|
@ -620,8 +620,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
# Makefile.LA
|
||||
if(NOT NO_LAPACK)
|
||||
foreach (float_type ${FLOAT_TYPES})
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
if (NOT DEFINED ${float_char}NEG_TCOPY)
|
||||
if (${float_char} STREQUAL "Z" OR ${float_char} STREQUAL "C" OR ${float_char} STREQUAL "X")
|
||||
|
@ -688,8 +688,8 @@ function (build_core TARGET_CORE KDIR TSUFFIX KERNEL_DEFINITIONS)
|
|||
foreach (float_type ${FLOAT_TYPES})
|
||||
# a bit of metaprogramming here to pull out the appropriate KERNEL var
|
||||
string(SUBSTRING ${float_type} 0 1 float_char)
|
||||
if (${float_type} STREQUAL "HALF")
|
||||
set (float_char "SH")
|
||||
if (${float_type} STREQUAL "BFLOAT16")
|
||||
set (float_char "SB")
|
||||
endif ()
|
||||
GenerateNamedObjects("generic/neg_tcopy_${${float_char}GEMM_UNROLL_M}.c" "" "neg_tcopy" false "" ${TSUFFIX} false ${float_type})
|
||||
GenerateNamedObjects("generic/laswp_ncopy_${${float_char}GEMM_UNROLL_N}.c" "" "laswp_ncopy" false "" ${TSUFFIX} false ${float_type})
|
||||
|
|
|
@ -262,9 +262,9 @@ ifndef XDOTKERNEL
|
|||
XDOTKERNEL = zdot.S
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
ifndef SHDOTKERNEL
|
||||
SHDOTKERNEL = ../x86_64/shdot.c
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
ifndef SBDOTKERNEL
|
||||
SBDOTKERNEL = ../x86_64/sbdot.c
|
||||
endif
|
||||
|
||||
ifndef TOBF16KERNEL
|
||||
|
@ -530,11 +530,11 @@ XBLASOBJS += \
|
|||
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
|
||||
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SHBLASOBJS += \
|
||||
shdot_k$(TSUFFIX).$(SUFFIX)
|
||||
sbdot_k$(TSUFFIX).$(SUFFIX)
|
||||
SHEXTOBJS += \
|
||||
shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX)
|
||||
sbstobf16_k$(TSUFFIX).$(SUFFIX) sbdtobf16_k$(TSUFFIX).$(SUFFIX)
|
||||
SHEXTOBJS += \
|
||||
sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
@ -757,12 +757,12 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
|
|||
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sbdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBDOTKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
|
||||
$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(KDIR)sbstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
|
||||
$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(KDIR)sbdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
|
||||
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
|
||||
$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
|
||||
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
|
||||
|
|
|
@ -80,24 +80,24 @@ SGEMMDIRECTPERFORMANT = sgemm_direct_performant.c
|
|||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
ifndef SHGEMMKERNEL
|
||||
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||
SHGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SHGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
ifndef SBGEMMKERNEL
|
||||
SBGEMM_BETA = ../generic/gemm_beta.c
|
||||
SBGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||
SBGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||
SBGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||
SBGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||
SBGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
SHKERNELOBJS += \
|
||||
shgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SHGEMMINCOPYOBJ) $(SHGEMMITCOPYOBJ) \
|
||||
$(SHGEMMONCOPYOBJ) $(SHGEMMOTCOPYOBJ)
|
||||
sbgemm_kernel$(TSUFFIX).$(SUFFIX) \
|
||||
$(SBGEMMINCOPYOBJ) $(SBGEMMITCOPYOBJ) \
|
||||
$(SBGEMMONCOPYOBJ) $(SBGEMMOTCOPYOBJ)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE),$(BUILD_COMPLEX))" ""
|
||||
|
@ -149,7 +149,7 @@ XKERNELOBJS += \
|
|||
$(XGEMMINCOPYOBJ) $(XGEMMITCOPYOBJ) \
|
||||
$(XGEMMONCOPYOBJ) $(XGEMMOTCOPYOBJ)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SHBLASOBJS += $(SHKERNELOBJS)
|
||||
endif
|
||||
SBLASOBJS += $(SKERNELOBJS)
|
||||
|
@ -159,8 +159,8 @@ CBLASOBJS += $(CKERNELOBJS)
|
|||
ZBLASOBJS += $(ZKERNELOBJS)
|
||||
XBLASOBJS += $(XKERNELOBJS)
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
SHBLASOBJS += shgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
SHBLASOBJS += sbgemm_beta$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifneq "$(or $(BUILD_SINGLE),$(BUILD_DOUBLE))" ""
|
||||
|
@ -492,11 +492,11 @@ ZBLASOBJS += \
|
|||
zgeadd_k$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
SHGEMMINCOPYOBJ_P = $(SHGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMITCOPYOBJ_P = $(SHGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMONCOPYOBJ_P = $(SHGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SHGEMMOTCOPYOBJ_P = $(SHGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
SBGEMMINCOPYOBJ_P = $(SBGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMITCOPYOBJ_P = $(SBGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMONCOPYOBJ_P = $(SBGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
SBGEMMOTCOPYOBJ_P = $(SBGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
endif
|
||||
|
||||
SGEMMINCOPYOBJ_P = $(SGEMMINCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
@ -524,9 +524,9 @@ XGEMMITCOPYOBJ_P = $(XGEMMITCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
|||
XGEMMONCOPYOBJ_P = $(XGEMMONCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
XGEMMOTCOPYOBJ_P = $(XGEMMOTCOPYOBJ:.$(SUFFIX)=.$(PSUFFIX))
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
|
@ -548,35 +548,35 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
|||
$(CC) $(CFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
|
||||
$(KDIR)$(SHGEMMONCOPYOBJ) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(KDIR)$(SBGEMMONCOPYOBJ) : $(KERNELDIR)/$(SBGEMMONCOPY)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SHGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||
$(KDIR)$(SBGEMMOTCOPYOBJ) : $(KERNELDIR)/$(SBGEMMOTCOPY)
|
||||
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmotcopy.s
|
||||
m4 shgemmotcopy.s > shgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmotcopy_nomacros.s -o $@
|
||||
rm shgemmotcopy.s shgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmotcopy.s
|
||||
m4 sbgemmotcopy.s > sbgemmotcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmotcopy_nomacros.s -o $@
|
||||
rm sbgemmotcopy.s sbgemmotcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
|
||||
|
||||
$(KDIR)$(SHGEMMINCOPYOBJ) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(KDIR)$(SBGEMMINCOPYOBJ) : $(KERNELDIR)/$(SBGEMMINCOPY)
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(KDIR)$(SHGEMMITCOPYOBJ) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||
$(KDIR)$(SBGEMMITCOPYOBJ) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemmitcopy.s
|
||||
m4 shgemmitcopy.s > shgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemmitcopy_nomacros.s -o $@
|
||||
rm shgemmitcopy.s shgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemmitcopy.s
|
||||
m4 sbgemmitcopy.s > sbgemmitcopy_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemmitcopy_nomacros.s -o $@
|
||||
rm sbgemmitcopy.s sbgemmitcopy_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
endif
|
||||
|
@ -746,16 +746,16 @@ $(KDIR)sgemm_direct$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMDIRECTKERNEL)
|
|||
$(CC) $(CFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
$(KDIR)sbgemm_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
|
||||
ifeq ($(OS), AIX)
|
||||
$(CC) $(CFLAGS) -S -DHALF -UDOUBLE -UCOMPLEX $< -o - > shgemm_kernel$(TSUFFIX).s
|
||||
m4 shgemm_kernel$(TSUFFIX).s > shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX shgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm shgemm_kernel$(TSUFFIX).s shgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -S -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o - > sbgemm_kernel$(TSUFFIX).s
|
||||
m4 sbgemm_kernel$(TSUFFIX).s > sbgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX sbgemm_kernel$(TSUFFIX)_nomacros.s -o $@
|
||||
rm sbgemm_kernel$(TSUFFIX).s sbgemm_kernel$(TSUFFIX)_nomacros.s
|
||||
else
|
||||
$(CC) $(CFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(CC) $(CFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
endif
|
||||
|
||||
|
@ -2375,9 +2375,9 @@ $(KDIR)xtrsm_oltncopy$(TSUFFIX).$(SUFFIX) : generic/ztrsm_ltcopy_$(XGEMM_UNROLL_
|
|||
$(KDIR)sgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
$(KDIR)shgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
$(KDIR)sbgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMM_BETA)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)dgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(DGEMM_BETA)
|
||||
|
@ -2396,19 +2396,19 @@ $(KDIR)xgemm_beta$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XGEMM_BETA)
|
|||
$(CC) $(PFLAGS) -c -DXDOUBLE -DCOMPLEX $< -o $@
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
$(SHGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
$(SBGEMMONCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMONCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SHGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMOTCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(SBGEMMOTCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMOTCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
ifneq ($(SHGEMM_UNROLL_M), $(SHGEMM_UNROLL_N))
|
||||
$(SHGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMINCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
ifneq ($(SBGEMM_UNROLL_M), $(SBGEMM_UNROLL_N))
|
||||
$(SBGEMMINCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMINCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
$(SHGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SHGEMMITCOPY)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
$(SBGEMMITCOPYOBJ_P) : $(KERNELDIR)/$(SBGEMMITCOPY)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
|
||||
endif
|
||||
endif
|
||||
|
@ -2518,9 +2518,9 @@ endif
|
|||
endif
|
||||
|
||||
|
||||
ifeq ($(BUILD_HALF), 1)
|
||||
$(KDIR)shgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHGEMMKERNEL) $(SHGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -DHALF -UDOUBLE -UCOMPLEX $< -o $@
|
||||
ifeq ($(BUILD_BFLOAT16), 1)
|
||||
$(KDIR)sbgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMMKERNEL) $(SBGEMMDEPEND)
|
||||
$(CC) $(PFLAGS) -c -DBFLOAT16 -UDOUBLE -UCOMPLEX $< -o $@
|
||||
endif
|
||||
|
||||
$(KDIR)sgemm_kernel$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) $(SGEMMDEPEND)
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
#include "common.h"
|
||||
#if defined(HALF) && defined(HALFCONVERSION)
|
||||
#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
|
||||
static float
|
||||
bfloat16tof32 (bfloat16 f16)
|
||||
{
|
||||
|
|
|
@ -7,16 +7,16 @@ else
|
|||
#CGEMM_BETA = ../generic/zgemm_beta.c
|
||||
#ZGEMM_BETA = ../generic/zgemm_beta.c
|
||||
|
||||
SHGEMM_BETA = ../generic/gemm_beta.c
|
||||
SHGEMMKERNEL = shgemm_kernel_power10.c
|
||||
SHGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SHGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SHGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SHGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SHGEMMINCOPYOBJ = shgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMITCOPYOBJ = shgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMONCOPYOBJ = shgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SHGEMMOTCOPYOBJ = shgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMM_BETA = ../generic/gemm_beta.c
|
||||
SBGEMMKERNEL = sbgemm_kernel_power10.c
|
||||
SBGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||
SBGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||
SBGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||
SBGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||
SBGEMMINCOPYOBJ = sbgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMITCOPYOBJ = sbgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMONCOPYOBJ = sbgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SBGEMMOTCOPYOBJ = sbgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
STRMMKERNEL = sgemm_kernel_power10.c
|
||||
DTRMMKERNEL = dgemm_kernel_power10.c
|
||||
|
|
|
@ -26,7 +26,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
**********************************************************************************/
|
||||
#include "common.h"
|
||||
#include <altivec.h>
|
||||
#if defined(HALF) && defined(HALFCONVERSION)
|
||||
#if defined(BFLOAT16) && defined(BFLOAT16CONVERSION)
|
||||
static float
|
||||
bfloat16tof32 (bfloat16 f16)
|
||||
{
|
||||
|
@ -131,7 +131,7 @@ vector char mask =
|
|||
|
||||
#define PREFETCH1(x, y) asm volatile ("dcbt %0, %1" : : "r" (x), "b" (y) : "memory");
|
||||
/*************************************************************************************
|
||||
* SHGEMM Kernel
|
||||
* SBGEMM Kernel
|
||||
*************************************************************************************/
|
||||
int
|
||||
CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, IFLOAT * A,
|
|
@ -53,32 +53,32 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
GEMM_DEFAULT_OFFSET_A, GEMM_DEFAULT_OFFSET_B, GEMM_DEFAULT_ALIGN,
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
#ifdef BUILD_BFLOAT16
|
||||
0, 0, 0,
|
||||
SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N,
|
||||
#ifdef SHGEMM_DEFAULT_UNROLL_MN
|
||||
SHGEMM_DEFAULT_UNROLL_MN,
|
||||
SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N,
|
||||
#ifdef SBGEMM_DEFAULT_UNROLL_MN
|
||||
SBGEMM_DEFAULT_UNROLL_MN,
|
||||
#else
|
||||
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
|
||||
MAX(SBGEMM_DEFAULT_UNROLL_M, SBGEMM_DEFAULT_UNROLL_N),
|
||||
#endif
|
||||
|
||||
shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
|
||||
sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
|
||||
|
||||
samax_kTS, samin_kTS, smax_kTS, smin_kTS,
|
||||
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
|
||||
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS,
|
||||
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS,
|
||||
dsdot_kTS,
|
||||
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
|
||||
sgemv_nTS, sgemv_tTS, sger_kTS,
|
||||
ssymv_LTS, ssymv_UTS,
|
||||
|
||||
shgemm_kernelTS, shgemm_betaTS,
|
||||
#if SHGEMM_DEFAULT_UNROLL_M != SHGEMM_DEFAULT_UNROLL_N
|
||||
shgemm_incopyTS, shgemm_itcopyTS,
|
||||
sbgemm_kernelTS, sbgemm_betaTS,
|
||||
#if SBGEMM_DEFAULT_UNROLL_M != SBGEMM_DEFAULT_UNROLL_N
|
||||
sbgemm_incopyTS, sbgemm_itcopyTS,
|
||||
#else
|
||||
shgemm_oncopyTS, shgemm_otcopyTS,
|
||||
sbgemm_oncopyTS, sbgemm_otcopyTS,
|
||||
#endif
|
||||
shgemm_oncopyTS, shgemm_otcopyTS,
|
||||
sbgemm_oncopyTS, sbgemm_otcopyTS,
|
||||
|
||||
strsm_kernel_LNTS, strsm_kernel_LTTS, strsm_kernel_RNTS, strsm_kernel_RTTS,
|
||||
#if SGEMM_DEFAULT_UNROLL_M != SGEMM_DEFAULT_UNROLL_N
|
||||
|
@ -830,8 +830,8 @@ gotoblas_t TABLE_NAME = {
|
|||
|
||||
#if (ARCH_ARM64)
|
||||
static void init_parameter(void) {
|
||||
#if (BUILD_HALF)
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
#if (BUILD_BFLOAT16)
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
|
@ -846,8 +846,8 @@ static void init_parameter(void) {
|
|||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
#endif
|
||||
|
||||
#if (BUILD_HALF)
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#if (BUILD_BFLOAT16)
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
#if BUILD_SINGLE == 1
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
|
@ -862,8 +862,8 @@ static void init_parameter(void) {
|
|||
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
|
||||
#if (BUILD_HALF)
|
||||
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
|
||||
#if (BUILD_BFLOAT16)
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
#if BUILD_SINGLE == 1
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
|
@ -936,16 +936,16 @@ static void init_parameter(void) {
|
|||
#if (ARCH_POWER)
|
||||
static void init_parameter(void) {
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
|
@ -953,8 +953,8 @@ static void init_parameter(void) {
|
|||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||
|
@ -965,16 +965,16 @@ static void init_parameter(void) {
|
|||
|
||||
#if (ARCH_ZARCH)
|
||||
static void init_parameter(void) {
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||
|
@ -982,8 +982,8 @@ static void init_parameter(void) {
|
|||
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||
|
@ -1124,10 +1124,10 @@ static void init_parameter(void) {
|
|||
(void) l2; /* dirty trick to suppress unused variable warning for targets */
|
||||
/* where the GEMM unrolling parameters do not depend on l2 */
|
||||
|
||||
#ifdef BUILD_HALF
|
||||
TABLE_NAME.shgemm_p = SHGEMM_DEFAULT_P;
|
||||
TABLE_NAME.shgemm_r = SHGEMM_DEFAULT_R;
|
||||
TABLE_NAME.shgemm_q = SHGEMM_DEFAULT_Q;
|
||||
#ifdef BUILD_BFLOAT16
|
||||
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||
#endif
|
||||
#if (BUILD_SINGLE==1) || (BUILD_COMPLEX==1)
|
||||
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||
|
|
|
@ -146,8 +146,8 @@ ifndef XDOTKERNEL
|
|||
XDOTKERNEL = zdot.S
|
||||
endif
|
||||
|
||||
ifndef SHDOTKERNEL
|
||||
SHDOTKERNEL = shdot.c
|
||||
ifndef SBDOTKERNEL
|
||||
SBDOTKERNEL = sbdot.c
|
||||
endif
|
||||
|
||||
ifndef TOBF16KERNEL
|
||||
|
|
|
@ -28,16 +28,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
|
||||
#if defined(COOPERLAKE)
|
||||
#include "shdot_microk_cooperlake.c"
|
||||
#include "sbdot_microk_cooperlake.c"
|
||||
#endif
|
||||
|
||||
static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
|
||||
static float sbdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
|
||||
{
|
||||
float d = 0.0;
|
||||
|
||||
#ifdef HAVE_SHDOT_ACCL_KERNEL
|
||||
#ifdef HAVE_SBDOT_ACCL_KERNEL
|
||||
if ((inc_x == 1) && (inc_y == 1)) {
|
||||
return shdot_accl_kernel(n, x, y);
|
||||
return sbdot_accl_kernel(n, x, y);
|
||||
}
|
||||
#endif
|
||||
|
||||
|
@ -56,11 +56,11 @@ static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y,
|
|||
}
|
||||
|
||||
#if defined(SMP)
|
||||
static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2,
|
||||
static int sbdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2,
|
||||
bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y,
|
||||
float *result, BLASLONG dummy3)
|
||||
{
|
||||
*(float *)result = shdot_compute(n, x, inc_x, y, inc_y);
|
||||
*(float *)result = sbdot_compute(n, x, inc_x, y, inc_y);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
@ -94,13 +94,13 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y
|
|||
}
|
||||
|
||||
if (nthreads <= 1) {
|
||||
dot_result = shdot_compute(n, x, inc_x, y, inc_y);
|
||||
dot_result = sbdot_compute(n, x, inc_x, y, inc_y);
|
||||
} else {
|
||||
char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2];
|
||||
int mode = BLAS_BFLOAT16 | BLAS_REAL;
|
||||
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
|
||||
x, inc_x, y, inc_y, thread_result, 0,
|
||||
(void *)shdot_thread_func, nthreads);
|
||||
(void *)sbdot_thread_func, nthreads);
|
||||
float * ptr = (float *)thread_result;
|
||||
for (int i = 0; i < nthreads; i++) {
|
||||
dot_result += (*ptr);
|
||||
|
@ -108,7 +108,7 @@ float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y
|
|||
}
|
||||
}
|
||||
#else
|
||||
dot_result = shdot_compute(n, x, inc_x, y, inc_y);
|
||||
dot_result = sbdot_compute(n, x, inc_x, y, inc_y);
|
||||
#endif
|
||||
|
||||
return dot_result;
|
|
@ -28,11 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* need a new enough GCC for avx512 support */
|
||||
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
|
||||
|
||||
#define HAVE_SHDOT_ACCL_KERNEL 1
|
||||
#define HAVE_SBDOT_ACCL_KERNEL 1
|
||||
#include "common.h"
|
||||
#include <immintrin.h>
|
||||
|
||||
static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
|
||||
static float sbdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
|
||||
{
|
||||
__m128 accum128 = _mm_setzero_ps();
|
||||
if (n> 127) { /* n range from 128 to inf. */
|
|
@ -382,7 +382,7 @@ static int thread_driver(blas_arg_t *args, FLOAT *sa, FLOAT *sb){
|
|||
mask = MAX(DGEMM_UNROLL_M, DGEMM_UNROLL_N) - 1;
|
||||
#elif defined(HALF)
|
||||
mode = BLAS_HALF | BLAS_REAL;
|
||||
mask = MAX(SHGEMM_UNROLL_M, SHGEMM_UNROLL_N) - 1;
|
||||
mask = MAX(SBGEMM_UNROLL_M, SBGEMM_UNROLL_N) - 1;
|
||||
#else
|
||||
mode = BLAS_SINGLE | BLAS_REAL;
|
||||
mask = MAX(SGEMM_UNROLL_M, SGEMM_UNROLL_N) - 1;
|
||||
|
|
32
param.h
32
param.h
|
@ -72,12 +72,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#ifndef PARAM_H
|
||||
#define PARAM_H
|
||||
|
||||
#define SHGEMM_DEFAULT_UNROLL_N 4
|
||||
#define SHGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SHGEMM_DEFAULT_UNROLL_MN 32
|
||||
#define SHGEMM_DEFAULT_P 256
|
||||
#define SHGEMM_DEFAULT_R 256
|
||||
#define SHGEMM_DEFAULT_Q 256
|
||||
#define SBGEMM_DEFAULT_UNROLL_N 4
|
||||
#define SBGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SBGEMM_DEFAULT_UNROLL_MN 32
|
||||
#define SBGEMM_DEFAULT_P 256
|
||||
#define SBGEMM_DEFAULT_R 256
|
||||
#define SBGEMM_DEFAULT_Q 256
|
||||
#ifdef OPTERON
|
||||
|
||||
#define SNUMOPT 4
|
||||
|
@ -2426,16 +2426,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#endif
|
||||
|
||||
#if defined(POWER10)
|
||||
#undef SHGEMM_DEFAULT_UNROLL_N
|
||||
#undef SHGEMM_DEFAULT_UNROLL_M
|
||||
#undef SHGEMM_DEFAULT_P
|
||||
#undef SHGEMM_DEFAULT_R
|
||||
#undef SHGEMM_DEFAULT_Q
|
||||
#define SHGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SHGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SHGEMM_DEFAULT_P 832
|
||||
#define SHGEMM_DEFAULT_Q 1026
|
||||
#define SHGEMM_DEFAULT_R 4096
|
||||
#undef SBGEMM_DEFAULT_UNROLL_N
|
||||
#undef SBGEMM_DEFAULT_UNROLL_M
|
||||
#undef SBGEMM_DEFAULT_P
|
||||
#undef SBGEMM_DEFAULT_R
|
||||
#undef SBGEMM_DEFAULT_Q
|
||||
#define SBGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SBGEMM_DEFAULT_UNROLL_N 8
|
||||
#define SBGEMM_DEFAULT_P 832
|
||||
#define SBGEMM_DEFAULT_Q 1026
|
||||
#define SBGEMM_DEFAULT_R 4096
|
||||
#endif
|
||||
|
||||
#if defined(SPARC) && defined(V7)
|
||||
|
|
|
@ -214,16 +214,16 @@ endif
|
|||
|
||||
|
||||
|
||||
#ifeq ($(BUILD_HALF),1)
|
||||
#level3 : test_shgemm sblat3 dblat3 cblat3 zblat3
|
||||
#ifeq ($(BUILD_BFLOAT16),1)
|
||||
#level3 : test_sbgemm sblat3 dblat3 cblat3 zblat3
|
||||
#else
|
||||
#level3 : sblat3 dblat3 cblat3 zblat3
|
||||
#endif
|
||||
|
||||
ifndef CROSS
|
||||
rm -f ?BLAT3.SUMM
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_shgemm > SHBLAT3.SUMM
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./test_sbgemm > SHBLAT3.SUMM
|
||||
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
@ -245,8 +245,8 @@ endif
|
|||
ifdef SMP
|
||||
rm -f ?BLAT3.SUMM
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
OMP_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
OMP_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM
|
||||
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
@ -266,8 +266,8 @@ ifeq ($(BUILD_COMPLEX16),1)
|
|||
@$(GREP) -q FATAL ZBLAT3.SUMM && cat ZBLAT3.SUMM || exit 0
|
||||
endif
|
||||
else
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
OPENBLAS_NUM_THREADS=2 ./test_shgemm > SHBLAT3.SUMM
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
OPENBLAS_NUM_THREADS=2 ./test_sbgemm > SHBLAT3.SUMM
|
||||
@$(GREP) -q FATAL SHBLAT3.SUMM && cat SHBLAT3.SUMM || exit 0
|
||||
endif
|
||||
ifeq ($(BUILD_SINGLE),1)
|
||||
|
@ -377,9 +377,9 @@ zblat3 : zblat3.$(SUFFIX) ../$(LIBNAME)
|
|||
$(FC) $(FLDFLAGS) -o zblat3 zblat3.$(SUFFIX) ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
test_shgemm : compare_sgemm_shgemm.c ../$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o test_shgemm compare_sgemm_shgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
|
||||
ifeq ($(BUILD_BFLOAT16),1)
|
||||
test_sbgemm : compare_sgemm_sbgemm.c ../$(LIBNAME)
|
||||
$(FC) $(FLDFLAGS) -o test_sbgemm compare_sgemm_sbgemm.c ../$(LIBNAME) $(EXTRALIB) $(CEXTRALIB)
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_COMPLEX),1)
|
||||
|
@ -398,7 +398,7 @@ clean:
|
|||
@rm -f *.$(SUFFIX) *.$(PSUFFIX) gmon.$(SUFFIX)ut *.SUMM *.cxml *.exe *.pdb *.dwf \
|
||||
sblat1 dblat1 cblat1 zblat1 \
|
||||
sblat2 dblat2 cblat2 zblat2 \
|
||||
test_shgemm sblat3 dblat3 cblat3 zblat3 \
|
||||
test_sbgemm sblat3 dblat3 cblat3 zblat3 \
|
||||
sblat1p dblat1p cblat1p zblat1p \
|
||||
sblat2p dblat2p cblat2p zblat2p \
|
||||
sblat3p dblat3p cblat3p zblat3p \
|
||||
|
|
|
@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <stdint.h>
|
||||
#include "../common.h"
|
||||
#define SGEMM BLASFUNC(sgemm)
|
||||
#define SHGEMM BLASFUNC(shgemm)
|
||||
#define SBGEMM BLASFUNC(sbgemm)
|
||||
typedef union
|
||||
{
|
||||
unsigned short v;
|
||||
|
@ -102,7 +102,7 @@ main (int argc, char *argv[])
|
|||
}
|
||||
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
|
||||
&m, B, &k, &beta, C, &m);
|
||||
SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
|
||||
SBGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
|
||||
&m, BB, &k, &beta, CC, &m);
|
||||
for (i = 0; i < n; i++)
|
||||
for (j = 0; j < m; j++)
|
||||
|
@ -126,6 +126,6 @@ main (int argc, char *argv[])
|
|||
}
|
||||
}
|
||||
if (ret != 0)
|
||||
fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret);
|
||||
fprintf (stderr, "FATAL ERROR SBGEMM - Return code: %d\n", ret);
|
||||
return ret;
|
||||
}
|
Loading…
Reference in New Issue