Merge pull request #4048 from imzhuhl/spr_sbgemm_fix
Sapphire Rapids sbgemm fix
This commit is contained in:
commit
c3a2d407a0
|
@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN
|
||||||
endif
|
endif
|
||||||
ifneq ($(NO_AVX512), 1)
|
ifneq ($(NO_AVX512), 1)
|
||||||
ifneq ($(NO_AVX2), 1)
|
ifneq ($(NO_AVX2), 1)
|
||||||
DYNAMIC_CORE += SKYLAKEX COOPERLAKE
|
DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -82,7 +82,7 @@ if (DYNAMIC_ARCH)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN)
|
||||||
endif ()
|
endif ()
|
||||||
if (NOT NO_AVX512)
|
if (NOT NO_AVX512)
|
||||||
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE)
|
set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS)
|
||||||
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
|
||||||
endif ()
|
endif ()
|
||||||
if (DYNAMIC_LIST)
|
if (DYNAMIC_LIST)
|
||||||
|
|
|
@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
||||||
#message(STATUS "skipping ${makefile_line}")
|
#message(STATUS "skipping ${makefile_line}")
|
||||||
continue ()
|
continue ()
|
||||||
endif ()
|
endif ()
|
||||||
|
|
||||||
|
# Example 1: SBGEMM_SMALL_M_PERMIT =
|
||||||
|
# Unset the variable
|
||||||
|
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}")
|
||||||
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
|
set(var_name ${CMAKE_MATCH_1})
|
||||||
|
unset(${var_name})
|
||||||
|
endif()
|
||||||
|
|
||||||
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}")
|
||||||
if (NOT "${line_match}" STREQUAL "")
|
if (NOT "${line_match}" STREQUAL "")
|
||||||
#message(STATUS "match on ${line_match}")
|
#message(STATUS "match on ${line_match}")
|
||||||
|
|
|
@ -53,6 +53,7 @@ typedef struct {
|
||||||
int sbgemm_p, sbgemm_q, sbgemm_r;
|
int sbgemm_p, sbgemm_q, sbgemm_r;
|
||||||
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn;
|
||||||
int sbgemm_align_k;
|
int sbgemm_align_k;
|
||||||
|
int need_amxtile_permission; // 0 default, 1 for device support amx.
|
||||||
|
|
||||||
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
|
||||||
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
|
||||||
|
|
|
@ -1479,6 +1479,8 @@ int get_cpuname(void){
|
||||||
else
|
else
|
||||||
return CPUTYPE_NEHALEM;
|
return CPUTYPE_NEHALEM;
|
||||||
case 15: // Sapphire Rapids
|
case 15: // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return CPUTYPE_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return CPUTYPE_COOPERLAKE;
|
return CPUTYPE_COOPERLAKE;
|
||||||
if(support_avx512())
|
if(support_avx512())
|
||||||
|
@ -1845,7 +1847,8 @@ static char *cpuname[] = {
|
||||||
"ZEN",
|
"ZEN",
|
||||||
"SKYLAKEX",
|
"SKYLAKEX",
|
||||||
"DHYANA",
|
"DHYANA",
|
||||||
"COOPERLAKE"
|
"COOPERLAKE",
|
||||||
|
"SAPPHIRERAPIDS",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1902,7 +1905,8 @@ static char *lowercpuname[] = {
|
||||||
"zen",
|
"zen",
|
||||||
"skylakex",
|
"skylakex",
|
||||||
"dhyana",
|
"dhyana",
|
||||||
"cooperlake"
|
"cooperlake",
|
||||||
|
"sapphirerapids",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
|
|
@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
|
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
|
||||||
#endif
|
#endif
|
||||||
|
#ifdef DYN_SAPPHIRERAPIDS
|
||||||
|
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||||
|
#elif defined(DYN_SKYLAKEX)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX
|
||||||
|
#elif defined(DYN_HASWELL)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||||
|
#elif defined(DYN_SANDYBRIDGE)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
|
||||||
|
#elif defined(DYN_NEHALEM)
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||||
|
#else
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
#else // not DYNAMIC_LIST
|
#else // not DYNAMIC_LIST
|
||||||
|
@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN;
|
||||||
#ifndef NO_AVX512
|
#ifndef NO_AVX512
|
||||||
extern gotoblas_t gotoblas_SKYLAKEX;
|
extern gotoblas_t gotoblas_SKYLAKEX;
|
||||||
extern gotoblas_t gotoblas_COOPERLAKE;
|
extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
|
extern gotoblas_t gotoblas_SAPPHIRERAPIDS;
|
||||||
#else
|
#else
|
||||||
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
#define gotoblas_SKYLAKEX gotoblas_HASWELL
|
||||||
#define gotoblas_COOPERLAKE gotoblas_HASWELL
|
#define gotoblas_COOPERLAKE gotoblas_HASWELL
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
#else
|
#else
|
||||||
|
@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE;
|
||||||
#define gotoblas_HASWELL gotoblas_NEHALEM
|
#define gotoblas_HASWELL gotoblas_NEHALEM
|
||||||
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
#define gotoblas_SKYLAKEX gotoblas_NEHALEM
|
||||||
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
|
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
|
||||||
|
#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
#define gotoblas_STEAMROLLER gotoblas_BARCELONA
|
||||||
|
@ -378,6 +394,31 @@ int support_avx512_bf16(){
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#define BIT_AMX_TILE 0x01000000
|
||||||
|
#define BIT_AMX_BF16 0x00400000
|
||||||
|
#define BIT_AMX_ENBD 0x00060000
|
||||||
|
|
||||||
|
int support_amx_bf16() {
|
||||||
|
#if !defined(NO_AVX) && !defined(NO_AVX512)
|
||||||
|
int eax, ebx, ecx, edx;
|
||||||
|
int ret=0;
|
||||||
|
|
||||||
|
if (!support_avx512())
|
||||||
|
return 0;
|
||||||
|
// CPUID.7.0:EDX indicates AMX support
|
||||||
|
cpuid_count(7, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) {
|
||||||
|
// CPUID.D.0:EAX[17:18] indicates AMX enabled
|
||||||
|
cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx);
|
||||||
|
if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD)
|
||||||
|
ret = 1;
|
||||||
|
}
|
||||||
|
return ret;
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
extern void openblas_warning(int verbose, const char * msg);
|
extern void openblas_warning(int verbose, const char * msg);
|
||||||
#define FALLBACK_VERBOSE 1
|
#define FALLBACK_VERBOSE 1
|
||||||
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
|
||||||
|
@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (model == 15){ // Sapphire Rapids
|
if (model == 15){ // Sapphire Rapids
|
||||||
|
if(support_amx_bf16())
|
||||||
|
return &gotoblas_SAPPHIRERAPIDS;
|
||||||
if(support_avx512_bf16())
|
if(support_avx512_bf16())
|
||||||
return &gotoblas_COOPERLAKE;
|
return &gotoblas_COOPERLAKE;
|
||||||
if (support_avx512())
|
if (support_avx512())
|
||||||
|
@ -941,7 +984,8 @@ static char *corename[] = {
|
||||||
"Excavator",
|
"Excavator",
|
||||||
"Zen",
|
"Zen",
|
||||||
"SkylakeX",
|
"SkylakeX",
|
||||||
"Cooperlake"
|
"Cooperlake",
|
||||||
|
"SapphireRapids"
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
if (gotoblas == &gotoblas_ZEN) return corename[23];
|
||||||
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
|
||||||
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
|
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
|
||||||
|
if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26];
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
#define XFEATURE_XTILEDATA 18
|
||||||
|
#define ARCH_REQ_XCOMP_PERM 0x1023
|
||||||
|
static int openblas_amxtile_permission = 0;
|
||||||
|
static int init_amxtile_permission() {
|
||||||
|
long status =
|
||||||
|
syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA);
|
||||||
|
if (status != 0) {
|
||||||
|
fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, "
|
||||||
|
"Intel Sapphier Rapids), skip sbgemm calculation\n");
|
||||||
|
return -1;
|
||||||
|
}
|
||||||
|
openblas_amxtile_permission = 1;
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifndef CBLAS
|
#ifndef CBLAS
|
||||||
|
|
||||||
void NAME(char *TRANSA, char *TRANSB,
|
void NAME(char *TRANSA, char *TRANSB,
|
||||||
|
@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
#if defined(DYNAMIC_ARCH)
|
||||||
|
if (gotoblas->need_amxtile_permission &&
|
||||||
|
openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS)
|
||||||
|
if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16)
|
||||||
|
|
||||||
if ((args.m == 0) || (args.n == 0)) return;
|
if ((args.m == 0) || (args.n == 0)) return;
|
||||||
|
|
||||||
#if 0
|
#if 0
|
||||||
|
|
|
@ -33,7 +33,7 @@ endif
|
||||||
ifdef TARGET_CORE
|
ifdef TARGET_CORE
|
||||||
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
ifeq ($(TARGET_CORE), SAPPHIRERAPIDS)
|
||||||
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
|
||||||
ifeq ($(GCCVERSIONGTEQ10), 1)
|
ifeq ($(GCCVERSIONGTEQ11), 1)
|
||||||
override CFLAGS += -march=sapphirerapids
|
override CFLAGS += -march=sapphirerapids
|
||||||
else
|
else
|
||||||
override CFLAGS += -march=skylake-avx512 -mavx512f
|
override CFLAGS += -march=skylake-avx512 -mavx512f
|
||||||
|
|
|
@ -66,6 +66,7 @@ gotoblas_t TABLE_NAME = {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
SBGEMM_ALIGN_K,
|
SBGEMM_ALIGN_K,
|
||||||
|
0, // need_amxtile_permission
|
||||||
|
|
||||||
sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
|
sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
|
||||||
|
|
||||||
|
@ -1809,6 +1810,12 @@ static void init_parameter(void) {
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef SAPPHIRERAPIDS
|
||||||
|
#if (BUILD_BFLOAT16 == 1)
|
||||||
|
TABLE_NAME.need_amxtile_permission = 1;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#if BUILD_COMPLEX==1
|
#if BUILD_COMPLEX==1
|
||||||
#ifdef CGEMM3M_DEFAULT_P
|
#ifdef CGEMM3M_DEFAULT_P
|
||||||
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
|
TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
|
||||||
|
|
|
@ -1,6 +1,14 @@
|
||||||
include $(KERNELDIR)/KERNEL.COOPERLAKE
|
include $(KERNELDIR)/KERNEL.COOPERLAKE
|
||||||
|
|
||||||
SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c
|
SBGEMM_SMALL_M_PERMIT =
|
||||||
|
SBGEMM_SMALL_K_NN =
|
||||||
|
SBGEMM_SMALL_K_B0_NN =
|
||||||
|
SBGEMM_SMALL_K_NT =
|
||||||
|
SBGEMM_SMALL_K_B0_NT =
|
||||||
|
SBGEMM_SMALL_K_TN =
|
||||||
|
SBGEMM_SMALL_K_B0_TN =
|
||||||
|
SBGEMM_SMALL_K_TT =
|
||||||
|
SBGEMM_SMALL_K_B0_TT =
|
||||||
|
|
||||||
SBGEMM_BETA = sgemm_beta_skylakex.c
|
SBGEMM_BETA = sgemm_beta_skylakex.c
|
||||||
SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c
|
SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c
|
||||||
|
|
|
@ -97,33 +97,32 @@ typedef struct {
|
||||||
#define T_C10 6
|
#define T_C10 6
|
||||||
#define T_C11 7
|
#define T_C11 7
|
||||||
|
|
||||||
// FIXME: gcc11 seem have problem in tile load/store address calc,
|
|
||||||
// need to multiply with element size (2 or 4) here.
|
|
||||||
#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2)
|
#define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2)
|
||||||
#define LOAD_A_TAIL(M, N) {\
|
#define LOAD_A_TAIL(M, N) {\
|
||||||
__m256i ymm = _mm256_loadu_epi16(ptr_a##M); \
|
__m256i ymm = _mm256_loadu_epi16(ptr_a##M); \
|
||||||
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
||||||
_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
|
_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
|
||||||
_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
|
_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
|
||||||
}
|
}
|
||||||
#define MASK_LOAD_A_TAIL(M, N) {\
|
#define MASK_LOAD_A_TAIL(M, N) {\
|
||||||
__m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \
|
__m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \
|
||||||
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
||||||
_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
|
_mm512_storeu_epi16(tail_a + 16 * M, zmm); \
|
||||||
_tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \
|
_tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \
|
||||||
}
|
}
|
||||||
#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2)
|
#define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2)
|
||||||
#define LOAD_B_TAIL(M, N) {\
|
#define LOAD_B_TAIL(M, N) {\
|
||||||
__m256i ymm = _mm256_loadu_epi16(ptr_b##N); \
|
__m256i ymm = _mm256_loadu_epi16(ptr_b##N); \
|
||||||
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
||||||
_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
|
_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
|
||||||
_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
|
_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
|
||||||
}
|
}
|
||||||
#define MASK_LOAD_B_TAIL(M, N) {\
|
#define MASK_LOAD_B_TAIL(M, N) {\
|
||||||
__m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \
|
__m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \
|
||||||
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
__m512i zmm = _mm512_cvtepu16_epi32(ymm); \
|
||||||
_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
|
_mm512_storeu_epi16(tail_b + 16 * N, zmm); \
|
||||||
_tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \
|
_tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \
|
||||||
}
|
}
|
||||||
|
|
||||||
#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)
|
#define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)
|
||||||
|
|
Loading…
Reference in New Issue