diff --git a/Makefile.system b/Makefile.system index 7dc17c486..7d26eccc3 100644 --- a/Makefile.system +++ b/Makefile.system @@ -645,7 +645,7 @@ DYNAMIC_CORE += HASWELL ZEN endif ifneq ($(NO_AVX512), 1) ifneq ($(NO_AVX2), 1) -DYNAMIC_CORE += SKYLAKEX COOPERLAKE +DYNAMIC_CORE += SKYLAKEX COOPERLAKE SAPPHIRERAPIDS endif endif endif diff --git a/cmake/arch.cmake b/cmake/arch.cmake index 8521f3988..f70019800 100644 --- a/cmake/arch.cmake +++ b/cmake/arch.cmake @@ -82,7 +82,7 @@ if (DYNAMIC_ARCH) set(DYNAMIC_CORE ${DYNAMIC_CORE} HASWELL ZEN) endif () if (NOT NO_AVX512) - set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE) + set(DYNAMIC_CORE ${DYNAMIC_CORE} SKYLAKEX COOPERLAKE SAPPHIRERAPIDS) string(REGEX REPLACE "-march=native" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}") endif () if (DYNAMIC_LIST) diff --git a/cmake/utils.cmake b/cmake/utils.cmake index 56c1cb060..5e8ba866b 100644 --- a/cmake/utils.cmake +++ b/cmake/utils.cmake @@ -87,6 +87,15 @@ macro(ParseMakefileVars MAKEFILE_IN) #message(STATUS "skipping ${makefile_line}") continue () endif () + + # Example 1: SBGEMM_SMALL_M_PERMIT = + # Unset the variable + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*$" line_match "${makefile_line}") + if (NOT "${line_match}" STREQUAL "") + set(var_name ${CMAKE_MATCH_1}) + unset(${var_name}) + endif() + string(REGEX MATCH "([0-9_a-zA-Z]+)[ \t]*=[ \t]*(.+)$" line_match "${makefile_line}") if (NOT "${line_match}" STREQUAL "") #message(STATUS "match on ${line_match}") diff --git a/common_param.h b/common_param.h index 8b39ca2fc..c082d248e 100644 --- a/common_param.h +++ b/common_param.h @@ -53,6 +53,7 @@ typedef struct { int sbgemm_p, sbgemm_q, sbgemm_r; int sbgemm_unroll_m, sbgemm_unroll_n, sbgemm_unroll_mn; int sbgemm_align_k; + int need_amxtile_permission; // 0 default, 1 for device support amx. void (*sbstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG); void (*sbdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG); diff --git a/cpuid_x86.c b/cpuid_x86.c index 69cbba90e..c2486e380 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1479,6 +1479,8 @@ int get_cpuname(void){ else return CPUTYPE_NEHALEM; case 15: // Sapphire Rapids + if(support_amx_bf16()) + return CPUTYPE_SAPPHIRERAPIDS; if(support_avx512_bf16()) return CPUTYPE_COOPERLAKE; if(support_avx512()) @@ -1845,7 +1847,8 @@ static char *cpuname[] = { "ZEN", "SKYLAKEX", "DHYANA", - "COOPERLAKE" + "COOPERLAKE", + "SAPPHIRERAPIDS", }; static char *lowercpuname[] = { @@ -1902,7 +1905,8 @@ static char *lowercpuname[] = { "zen", "skylakex", "dhyana", - "cooperlake" + "cooperlake", + "sapphirerapids", }; static char *corename[] = { diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index f61930983..8e0f53f74 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -220,6 +220,19 @@ extern gotoblas_t gotoblas_COOPERLAKE; #else #define gotoblas_COOPERLAKE gotoblas_PRESCOTT #endif +#ifdef DYN_SAPPHIRERAPIDS +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; +#elif defined(DYN_SKYLAKEX) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SKYLAKEX +#elif defined(DYN_HASWELL) +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL +#elif defined(DYN_SANDYBRIDGE) +#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE +#elif defined(DYN_NEHALEM) +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM +#else +#define gotoblas_SAPPHIRERAPIDS gotoblas_PRESCOTT +#endif #else // not DYNAMIC_LIST @@ -268,9 +281,11 @@ extern gotoblas_t gotoblas_ZEN; #ifndef NO_AVX512 extern gotoblas_t gotoblas_SKYLAKEX; extern gotoblas_t gotoblas_COOPERLAKE; +extern gotoblas_t gotoblas_SAPPHIRERAPIDS; #else #define gotoblas_SKYLAKEX gotoblas_HASWELL #define gotoblas_COOPERLAKE gotoblas_HASWELL +#define gotoblas_SAPPHIRERAPIDS gotoblas_HASWELL #endif #endif #else @@ -279,6 +294,7 @@ extern gotoblas_t gotoblas_COOPERLAKE; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_COOPERLAKE gotoblas_NEHALEM +#define gotoblas_SAPPHIRERAPIDS gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA @@ -378,6 +394,31 @@ int support_avx512_bf16(){ #endif } +#define BIT_AMX_TILE 0x01000000 +#define BIT_AMX_BF16 0x00400000 +#define BIT_AMX_ENBD 0x00060000 + +int support_amx_bf16() { +#if !defined(NO_AVX) && !defined(NO_AVX512) + int eax, ebx, ecx, edx; + int ret=0; + + if (!support_avx512()) + return 0; + // CPUID.7.0:EDX indicates AMX support + cpuid_count(7, 0, &eax, &ebx, &ecx, &edx); + if ((edx & BIT_AMX_TILE) && (edx & BIT_AMX_BF16)) { + // CPUID.D.0:EAX[17:18] indicates AMX enabled + cpuid_count(0xd, 0, &eax, &ebx, &ecx, &edx); + if ((eax & BIT_AMX_ENBD) == BIT_AMX_ENBD) + ret = 1; + } + return ret; +#else + return 0; +#endif +} + extern void openblas_warning(int verbose, const char * msg); #define FALLBACK_VERBOSE 1 #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" @@ -689,6 +730,8 @@ static gotoblas_t *get_coretype(void){ } } if (model == 15){ // Sapphire Rapids + if(support_amx_bf16()) + return &gotoblas_SAPPHIRERAPIDS; if(support_avx512_bf16()) return &gotoblas_COOPERLAKE; if (support_avx512()) @@ -941,7 +984,8 @@ static char *corename[] = { "Excavator", "Zen", "SkylakeX", - "Cooperlake" + "Cooperlake", + "SapphireRapids" }; char *gotoblas_corename(void) { @@ -1006,6 +1050,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; if (gotoblas == &gotoblas_COOPERLAKE) return corename[25]; + if (gotoblas == &gotoblas_SAPPHIRERAPIDS) return corename[26]; return corename[0]; } diff --git a/interface/gemm.c b/interface/gemm.c index 71cc77a1b..4778b641b 100644 --- a/interface/gemm.c +++ b/interface/gemm.c @@ -154,6 +154,23 @@ static size_t zgemm_small_kernel_b0[] = { #endif #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#define XFEATURE_XTILEDATA 18 +#define ARCH_REQ_XCOMP_PERM 0x1023 +static int openblas_amxtile_permission = 0; +static int init_amxtile_permission() { + long status = + syscall(SYS_arch_prctl, ARCH_REQ_XCOMP_PERM, XFEATURE_XTILEDATA); + if (status != 0) { + fprintf(stderr, "XTILEDATA permission not granted in your device(Linux, " + "Intel Sapphier Rapids), skip sbgemm calculation\n"); + return -1; + } + openblas_amxtile_permission = 1; + return 0; +} +#endif + #ifndef CBLAS void NAME(char *TRANSA, char *TRANSB, @@ -455,6 +472,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS #endif +#if defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) +#if defined(DYNAMIC_ARCH) + if (gotoblas->need_amxtile_permission && + openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#if !defined(DYNAMIC_ARCH) && defined(SAPPHIRERAPIDS) + if (openblas_amxtile_permission == 0 && init_amxtile_permission() == -1) { + return; + } +#endif +#endif // defined(__linux__) && defined(__x86_64__) && defined(BFLOAT16) + if ((args.m == 0) || (args.n == 0)) return; #if 0 diff --git a/kernel/Makefile b/kernel/Makefile index 977886044..d426a1bdb 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -33,7 +33,7 @@ endif ifdef TARGET_CORE ifeq ($(TARGET_CORE), SAPPHIRERAPIDS) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) - ifeq ($(GCCVERSIONGTEQ10), 1) + ifeq ($(GCCVERSIONGTEQ11), 1) override CFLAGS += -march=sapphirerapids else override CFLAGS += -march=skylake-avx512 -mavx512f diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 7832c0a87..4c361f155 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -66,6 +66,7 @@ gotoblas_t TABLE_NAME = { #endif SBGEMM_ALIGN_K, + 0, // need_amxtile_permission sbstobf16_kTS, sbdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS, @@ -1809,6 +1810,12 @@ static void init_parameter(void) { #endif +#ifdef SAPPHIRERAPIDS +#if (BUILD_BFLOAT16 == 1) + TABLE_NAME.need_amxtile_permission = 1; +#endif +#endif + #if BUILD_COMPLEX==1 #ifdef CGEMM3M_DEFAULT_P TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; diff --git a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS index 88f574668..3a832e917 100644 --- a/kernel/x86_64/KERNEL.SAPPHIRERAPIDS +++ b/kernel/x86_64/KERNEL.SAPPHIRERAPIDS @@ -1,6 +1,14 @@ include $(KERNELDIR)/KERNEL.COOPERLAKE -SBGEMM_SMALL_M_PERMIT = sbgemm_small_kernel_permit_spr.c +SBGEMM_SMALL_M_PERMIT = +SBGEMM_SMALL_K_NN = +SBGEMM_SMALL_K_B0_NN = +SBGEMM_SMALL_K_NT = +SBGEMM_SMALL_K_B0_NT = +SBGEMM_SMALL_K_TN = +SBGEMM_SMALL_K_B0_TN = +SBGEMM_SMALL_K_TT = +SBGEMM_SMALL_K_B0_TT = SBGEMM_BETA = sgemm_beta_skylakex.c SBGEMMKERNEL = sbgemm_kernel_16x16_spr.c diff --git a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c index 90e0a32c7..5ee3c8532 100644 --- a/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c +++ b/kernel/x86_64/sbgemm_kernel_16x16_spr_tmpl.c @@ -97,33 +97,32 @@ typedef struct { #define T_C10 6 #define T_C11 7 -// FIXME: gcc11 seem have problem in tile load/store address calc, -// need to multiply with element size (2 or 4) here. + #define LOAD_A(M, N) _tile_loadd(T_A##M, ptr_a##M, lda * 2) #define LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define MASK_LOAD_A_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(amask, ptr_a##M); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_a + 16 * M, zmm); \ - _tile_loadd(T_A##M, tail_a + 16 * 2 * M, 2 * 2); \ + _tile_loadd(T_A##M, tail_a + 16 * M, 2 * 2); \ } #define LOAD_B(M, N) _tile_loadd(T_B##N, ptr_b##N, ldb * 2) #define LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_loadu_epi16(ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MASK_LOAD_B_TAIL(M, N) {\ __m256i ymm = _mm256_maskz_loadu_epi16(bmask, ptr_b##N); \ __m512i zmm = _mm512_cvtepu16_epi32(ymm); \ _mm512_storeu_epi16(tail_b + 16 * N, zmm); \ - _tile_loadd(T_B##N, tail_b + 16 * 2 * N, 2 * 2); \ + _tile_loadd(T_B##N, tail_b + 16 * N, 2 * 2); \ } #define MATMUL(M, N) _tile_dpbf16ps(T_C##M##N, T_A##M, T_B##N)