diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 6d18047fb..fd4ab4bec 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -183,4 +183,7 @@ In chronological order:
 
 * Rajalakshmi Srinivasaraghavan
   * [2020-04-15] Half-precision GEMM for bfloat16
-
+
+* Marius Hillenbrand
+  * [2020-05-12] Revise dynamic architecture detection for IBM z
+  * [2020-05-12] Add new sgemm and strmm kernel for IBM z14
diff --git a/Makefile.system b/Makefile.system
index 023546009..98d9ae313 100644
--- a/Makefile.system
+++ b/Makefile.system
@@ -563,8 +563,27 @@ DYNAMIC_CORE += EMAG8180
 endif
 
 ifeq ($(ARCH), zarch)
-DYNAMIC_CORE = Z13
+DYNAMIC_CORE = ZARCH_GENERIC
+
+# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
+GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2"))
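+# Note (editorial): expr prints 1 if the comparison holds and 0 otherwise;
+# the $(subst 0,,...) maps a plain "0" to the empty string so that the
+# $(or ...) test below only sees a "1" when at least one check passed.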
+
+ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
+RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"))
+endif
+
+ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1)
+DYNAMIC_CORE += Z13
+else
+$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
+endif
+
+GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
+ifeq ($(GCC_MAJOR_GE_7), 1)
 DYNAMIC_CORE += Z14
+else
+$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
+endif
 endif
 
 ifeq ($(ARCH), power)
@@ -855,7 +874,7 @@ ifneq ($(INTERFACE64), 0)
 FCOMMON_OPT += -i8
 endif
 endif
-FCOMMON_OPT += -recursive
+FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
 ifeq ($(USE_OPENMP), 1)
 FCOMMON_OPT += -fopenmp
 endif
diff --git a/Makefile.zarch b/Makefile.zarch
index 47ea1eb71..be1e34f6d 100644
--- a/Makefile.zarch
+++ b/Makefile.zarch
@@ -5,6 +5,6 @@
 FCOMMON_OPT += -march=z13 -mzvector
 endif
 ifeq ($(CORE), Z14)
-CCOMMON_OPT += -march=z14 -mzvector
+CCOMMON_OPT += -march=z14 -mzvector -O3
 FCOMMON_OPT += -march=z14 -mzvector
 endif
diff --git a/benchmark/Makefile b/benchmark/Makefile
index 90d903ad7..53f422be4 100644
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@@ -49,6 +49,12 @@ else
 GOTO_LAPACK_TARGETS=
 endif
 
+ifeq ($(BUILD_HALF),1)
+GOTO_HALF_TARGETS=shgemm.goto
+else
+GOTO_HALF_TARGETS=
+endif
+
 ifeq ($(OSNAME), WINNT)
 
 goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
@@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 	sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
 	spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
 	ssymm.goto dsymm.goto csymm.goto zsymm.goto \
-	saxpby.goto daxpby.goto caxpby.goto zaxpby.goto
+	saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)
 
 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
 	samin.goto damin.goto camin.goto zamin.goto \
 	smin.goto dmin.goto \
 	saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
-	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
+	snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
 
 acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
 	scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
@@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX)
 	-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
 
 ##################################### Sgemm ####################################################
+ifeq ($(BUILD_HALF),1)
+shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
+	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
+endif
+
 sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
 	$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
 
@@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c
 zcholesky.$(SUFFIX) : cholesky.c
 	$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
 
+ifeq ($(BUILD_HALF),1)
+shgemm.$(SUFFIX) : gemm.c
+	$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
+endif
+
 sgemm.$(SUFFIX) : gemm.c
 	$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
diff --git a/benchmark/gemm.c b/benchmark/gemm.c
index dd016a7c3..d2235330b 100644
--- a/benchmark/gemm.c
+++ b/benchmark/gemm.c
@@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #ifdef DOUBLE
 #define GEMM BLASFUNC(dgemm)
+#elif defined(HALF)
+#define GEMM BLASFUNC(shgemm)
 #else
 #define GEMM BLASFUNC(sgemm)
 #endif
@@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){
 
 int main(int argc, char *argv[]){
 
-  FLOAT *a, *b, *c;
+  IFLOAT *a, *b;
+  FLOAT *c;
   FLOAT alpha[] = {1.0, 0.0};
   FLOAT beta [] = {0.0, 0.0};
   char transa = 'N';
@@ -184,10 +187,10 @@ int main(int argc, char *argv[]){
     k = to;
   }
 
-  if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
+  if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) {
    fprintf(stderr,"Out of Memory!!\n");exit(1);
   }
-  if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
+  if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) {
    fprintf(stderr,"Out of Memory!!\n");exit(1);
   }
   if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
@@ -199,10 +202,10 @@ int main(int argc, char *argv[]){
 #endif
 
   for (i = 0; i < m * k * COMPSIZE; i++) {
-    a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+    a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
   }
   for (i = 0; i < k * n * COMPSIZE; i++) {
-    b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
+    b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
   }
   for (i = 0; i < m * n * COMPSIZE; i++) {
     c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
diff --git a/cmake/utils.cmake b/cmake/utils.cmake
index 7a125ec55..1c21e776e 100644
--- a/cmake/utils.cmake
+++ b/cmake/utils.cmake
@@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN)
     if (NOT "${line_match}" STREQUAL "")
       #message(STATUS "match on ${line_match}")
       set(var_name ${CMAKE_MATCH_1})
-      set(var_value ${CMAKE_MATCH_2})
+#      set(var_value ${CMAKE_MATCH_2})
+      string(STRIP ${CMAKE_MATCH_2} var_value)
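+      # (editorial note: STRIP drops leading/trailing whitespace that would
+      # otherwise end up in the parsed variable value, e.g. a trailing blank
+      # after a kernel file name)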
       # check for Makefile variables in the string, e.g. $(TSUFFIX)
       string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
       foreach (make_var ${make_var_matches})
@@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN)
     string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
     if (NOT "${line_match}" STREQUAL "")
 #      message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
-      if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
+      if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
 #        message (STATUS "condition is true")
         set (IfElse 1)
       else ()
diff --git a/driver/others/dynamic_zarch.c b/driver/others/dynamic_zarch.c
index 90d3051b1..403b34111 100644
--- a/driver/others/dynamic_zarch.c
+++ b/driver/others/dynamic_zarch.c
@@ -1,12 +1,58 @@
-
 #include "common.h"
+#include <stdlib.h>
+
+// Gate kernels for z13 and z14 on gcc version
+#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
+    /* RHEL 7 since 7.3: */ \
+    (__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
+     __GNUC_RH_RELEASE__ >= 11)
+#define HAVE_Z13_SUPPORT
+#endif
+
+#if __GNUC__ >= 7
+#define HAVE_Z14_SUPPORT
+#endif
+
+// Guard the use of getauxval() on glibc version >= 2.16
+#ifdef __GLIBC__
+#include <features.h>
+#if __GLIBC_PREREQ(2, 16)
+#include <sys/auxv.h>
+#define HAVE_GETAUXVAL 1
+
+static unsigned long get_hwcap(void)
+{
+	unsigned long hwcap = getauxval(AT_HWCAP);
+	char *maskenv;
+
+	// honor requests for not using specific CPU features in LD_HWCAP_MASK
+	maskenv = getenv("LD_HWCAP_MASK");
+	if (maskenv)
+		hwcap &= strtoul(maskenv, NULL, 0);
+
+	return hwcap;
+	// note that a missing auxval is interpreted as no capabilities
+	// available, which is safe.
+}
+
+#else // __GLIBC_PREREQ(2, 16)
+#warning "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
+
+static unsigned long get_hwcap(void) {
+	// treat missing support for getauxval() as no capabilities available,
+	// which is safe.
+	return 0;
+}
+#endif // __GLIBC_PREREQ(2, 16)
+#endif // __GLIBC__
+
+extern gotoblas_t gotoblas_ZARCH_GENERIC;
+#ifdef HAVE_Z13_SUPPORT
 extern gotoblas_t gotoblas_Z13;
+#endif
+#ifdef HAVE_Z14_SUPPORT
 extern gotoblas_t gotoblas_Z14;
-//extern gotoblas_t gotoblas_Z15;
-//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
-//extern gotoblas_t gotoblas_Z14;
-//#endif
+#endif
 
 #define NUM_CORETYPES 4
 
@@ -16,47 +62,50 @@ static char* corename[] = {
 	"unknown",
 	"Z13",
 	"Z14",
-//	"Z15",
 	"ZARCH_GENERIC",
 };
 
 char* gotoblas_corename(void) {
+#ifdef HAVE_Z13_SUPPORT
 	if (gotoblas == &gotoblas_Z13) return corename[1];
+#endif
+#ifdef HAVE_Z14_SUPPORT
 	if (gotoblas == &gotoblas_Z14) return corename[2];
-//	if (gotoblas == &gotoblas_Z15) return corename[3];
-//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
-//	if (gotoblas == &gotoblas_POWER9) return corename[3];
-//#endif
-	return corename[0]; // try generic?
+#endif
+	if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
+
+	return corename[0];
 }
 
-// __builtin_cpu_is is not supported by zarch
+/**
+ * Detect the best-fitting set of kernels by retrieving the CPU features
+ * supported by the OS from the auxiliary value AT_HWCAP and choosing the set
+ * of kernels ("coretype") that exploits most of the features and can be
+ * compiled with the available gcc version.
+ * Note that we cannot use vector registers on a z13 or newer unless supported
+ * by the OS kernel (which needs to handle them properly during context switch).
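+ * (Editorial note: HWCAP_S390_VX and HWCAP_S390_VXE below are the AT_HWCAP
+ * bits for the Vector and Vector-Enhancements facilities; glibc exposes the
+ * definitions via <sys/auxv.h>.)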
+ */
 static gotoblas_t* get_coretype(void) {
-	FILE* infile;
-	char buffer[512], * p;
-
-	p = (char*)NULL;
-	infile = fopen("/proc/sysinfo", "r");
-	while (fgets(buffer, sizeof(buffer), infile)) {
-		if (!strncmp("Type", buffer, 4)) {
-			p = strchr(buffer, ':') + 2;
-#if 0
-			fprintf(stderr, "%s\n", p);
+	unsigned long hwcap __attribute__((unused)) = get_hwcap();
+
+	// z14 and z15 systems: exploit Vector Facility (SIMD) and
+	// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
+#ifdef HAVE_Z14_SUPPORT
+	if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
+		return &gotoblas_Z14;
 #endif
-			break;
-		}
-	}
-	fclose(infile);
+	// z13: Vector Facility (SIMD for double)
+#ifdef HAVE_Z13_SUPPORT
+	if (hwcap & HWCAP_S390_VX)
+		return &gotoblas_Z13;
+#endif
 
-	if (strstr(p, "2964")) return &gotoblas_Z13;
-	if (strstr(p, "2965")) return &gotoblas_Z13;
-	if (strstr(p, "3906")) return &gotoblas_Z14;
-	if (strstr(p, "3907")) return &gotoblas_Z14;
-	if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
-	if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
-
-	return NULL; // should be ZARCH_GENERIC
+	// fallback in case of missing compiler support, systems before z13, or
+	// when the OS does not advertise support for the Vector Facility (e.g.,
+	// missing support in the OS kernel)
+	return &gotoblas_ZARCH_GENERIC;
 }
 
 static gotoblas_t* force_coretype(char* coretype) {
@@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) {
 
 	switch (found)
 	{
+#ifdef HAVE_Z13_SUPPORT
 	case  1: return (&gotoblas_Z13);
+#endif
+#ifdef HAVE_Z14_SUPPORT
 	case  2: return (&gotoblas_Z14);
-//	case  3: return (&gotoblas_Z15);
-//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
-//	case 3: return (&gotoblas_POWER9);
-//#endif
+#endif
+	case  3: return (&gotoblas_ZARCH_GENERIC);
 	default: return NULL;
 	}
 	snprintf(message, 128, "Core not found: %s\n", coretype);
@@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) {
 
 	if (gotoblas == NULL)
 	{
-		snprintf(coremsg, 128, "Falling back to Z14 core\n");
+		snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n");
 		openblas_warning(1, coremsg);
-		gotoblas = &gotoblas_Z14;
+		gotoblas = &gotoblas_ZARCH_GENERIC;
 	}
 
 	if (gotoblas && gotoblas->init) {
diff --git a/kernel/x86_64/KERNEL.SKYLAKEX b/kernel/x86_64/KERNEL.SKYLAKEX
index 65f031d03..448aee074 100644
--- a/kernel/x86_64/KERNEL.SKYLAKEX
+++ b/kernel/x86_64/KERNEL.SKYLAKEX
@@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c
 
 CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c
 ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
+
+CSCALKERNEL = ../arm/zscal.c
+ZSCALKERNEL = ../arm/zscal.c
diff --git a/kernel/zarch/KERNEL.Z14 b/kernel/zarch/KERNEL.Z14
index f6e3bec23..96e6745fd 100644
--- a/kernel/zarch/KERNEL.Z14
+++ b/kernel/zarch/KERNEL.Z14
@@ -86,23 +86,23 @@ DGEMVTKERNEL = dgemv_t_4.c
 CGEMVTKERNEL = cgemv_t_4.c
 ZGEMVTKERNEL = zgemv_t_4.c
 
-STRMMKERNEL	= strmm8x4V.S
+STRMMKERNEL	= gemm_vec.c
 DTRMMKERNEL	= trmm8x4V.S
 CTRMMKERNEL	= ctrmm4x4V.S
 ZTRMMKERNEL	= ztrmm4x4V.S
 
-SGEMMKERNEL	= strmm8x4V.S
-SGEMMINCOPY	= ../generic/gemm_ncopy_8.c
-SGEMMITCOPY	= ../generic/gemm_tcopy_8.c
-SGEMMONCOPY	= ../generic/gemm_ncopy_4.c
-SGEMMOTCOPY	= ../generic/gemm_tcopy_4.c
+SGEMMKERNEL	= gemm_vec.c
+ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
+SGEMMINCOPY	= ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
+SGEMMITCOPY	= ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
 SGEMMINCOPYOBJ	= sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ	= sgemm_itcopy$(TSUFFIX).$(SUFFIX)
+endif
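+# (editorial note: when SGEMM_UNROLL_M equals SGEMM_UNROLL_N, the generic
+# level-3 build rules reuse the ONCOPY/OTCOPY routines below for both
+# dimensions, so separate INCOPY/ITCOPY objects are only needed otherwise)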
+SGEMMONCOPY	= ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
+SGEMMOTCOPY	= ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
 SGEMMONCOPYOBJ	= sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ	= sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-
 DGEMMKERNEL	= gemm8x4V.S
 DGEMMINCOPY	= ../generic/gemm_ncopy_8.c
 DGEMMITCOPY	= ../generic/gemm_tcopy_8.c
@@ -145,7 +145,3 @@ ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
 ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
 
-
-
-
-
diff --git a/kernel/zarch/gemm_vec.c b/kernel/zarch/gemm_vec.c
new file mode 100644
index 000000000..2d4457f06
--- /dev/null
+++ b/kernel/zarch/gemm_vec.c
@@ -0,0 +1,472 @@
+/*
+ * Copyright (c) IBM Corporation 2020.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    1. Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *
+ *    2. Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *    3. Neither the name of the OpenBLAS project nor the names of
+ *       its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written
+ *       permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+ * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+ * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+ * USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "common.h"
+#include <vecintrin.h>
+
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+#ifdef COMPLEX
+#error "Handling for complex numbers is not supported in this kernel"
+#endif
+
+#ifdef DOUBLE
+#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
+#else
+#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
+#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
+#endif
+
+static const size_t unroll_m = UNROLL_M;
+static const size_t unroll_n = UNROLL_N;
+
+/* Handling of triangular matrices */
+#ifdef TRMMKERNEL
+static const bool trmm = true;
+static const bool left =
+#ifdef LEFT
+	true;
+#else
+	false;
+#endif
+
+static const bool backwards =
+#if defined(LEFT) != defined(TRANSA)
+	true;
+#else
+	false;
+#endif
+
+#else
+static const bool trmm = false;
+static const bool left = false;
+static const bool backwards = false;
+#endif /* TRMMKERNEL */
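+
+/*
+ * Editorial note: for the z14 target, param.h (updated in this patch) sets
+ * SGEMM_DEFAULT_UNROLL_M = 16 and SGEMM_DEFAULT_UNROLL_N = 4, so the
+ * single-precision register blocks of C computed below are at most 16x4.
+ */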
+
+/*
+ * Background:
+ *
+ * The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
+ * problem by splitting all matrices into partitions multiple times, so that the
+ * submatrices fit into the L1 or L2 caches. As a result, each multiplication of
+ * submatrices can stream data fast from the L1 and L2 caches. In between, it
+ * copies and rearranges the submatrices to enable contiguous memory accesses to
+ * improve locality in both caches and TLBs.
+ *
+ * At the heart of the algorithm is this kernel, which multiplies a "Block
+ * matrix" A (small dimensions) with a "Panel matrix" B (number of rows is
+ * small) and adds the result into a "Panel matrix" C; GotoBLAS calls this
+ * operation GEBP. This kernel further partitions GEBP twice, such that (1)
+ * submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a
+ * block of C fits into the registers, while multiplying panels from A and B
+ * streamed from the L2 and L1 cache, respectively (GEBP_block).
+ *
+ *
+ * Algorithm GEBP(A, B, C, m, n, k, alpha):
+ *
+ * The problem is calculating C += alpha * (A * B)
+ * C is an m x n matrix, A is an m x k matrix, B is a k x n matrix.
+ *
+ * - C is in column-major order, with an offset of ldc to the element in the
+ *   next column (same row).
+ * - A is in row-major order yet stores SGEMM_UNROLL_M elements of each column
+ *   contiguously while walking along rows.
+ * - B is in column-major order but packs SGEMM_UNROLL_N elements of a row
+ *   contiguously.
+ * If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
+ * SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2
+ * dimensions (e.g., 5 remaining columns would be in a block-of-4 and a
+ * block-of-1).
+ *
+ * Note that packing A and B into that form is taken care of by the caller in
+ * driver/level3/level3.c (actually done by "copy kernels").
+ *
+ * Steps:
+ * - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j.
+ *   Now, B_j should fit into the L1 cache.
+ * - For each partition, calculate C_j += alpha * (A * B_j) by
+ *    (1) calculating C_aux := A * B_j (see below), and
+ *    (2) unpacking C_j = C_j + alpha * C_aux.
+ *
+ *
+ * Algorithm for calculating C_aux:
+ *
+ * - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
+ *   such that the m_r x n_r submatrix of C_aux can be held in registers. Each
+ *   submatrix of C_aux can be calculated independently, and the registers are
+ *   added back into C_j.
+ *
+ * - For each row-block of C_aux (uses a row block of A and the full B_j):
+ *    - stream over all columns of A, multiply with elements from B, and
+ *      accumulate in registers (using different inner kernels to exploit
+ *      vectorization for varying block sizes), then
+ *    - add alpha * (row block of C_aux) back into C_j.
+ *
+ * Note that there are additional mechanics for handling triangular matrices,
+ * calculating B := alpha * (A * B) where either of the matrices A or B can be
+ * triangular. In case of A, the macro "LEFT" is defined. In addition, A can
+ * optionally be transposed.
+ * The code effectively skips an "offset" number of columns in A and rows of B
+ * in each block, to save unnecessary work by exploiting the triangular nature.
+ * To handle all cases, the code discerns (1) a "left" mode when A is triangular
+ * and (2) "forward" / "backwards" modes where only the first "offset"
+ * columns/rows of A/B are used or where the first "offset" columns/rows are
+ * skipped, respectively.
+ *
+ * Reference:
+ *
+ * The summary above is based on staring at various kernel implementations and:
+ * K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
+ * Multiplication, ACM Transactions on Mathematical Software, Vol. 34, No. 3,
+ * May 2008.
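+ *
+ * Worked packing example (an illustration, not from the original comment):
+ * with UNROLL_M = 4, a 6 x 3 matrix A is packed as one block of four rows,
+ * stored column after column (a11 a21 a31 a41 | a12 a22 a32 a42 | a13 a23
+ * a33 a43), followed by the remaining two rows as a power-of-2 block
+ * (a51 a61 | a52 a62 | a53 a63).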
+ */
+
+#define VLEN_BYTES 16
+#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
+
+typedef FLOAT vector_float __attribute__ ((vector_size (16)));
+
+/**
+ * Load a vector into a register, and hint on 8-byte alignment to improve
+ * performance. gcc-9 and newer will create these hints by themselves. For
+ * older compiler versions, use inline assembly to explicitly express the hint.
+ * Provide an explicit hex encoding to cater for binutils versions that do not
+ * know about vector-load with alignment hints yet.
+ *
+ * Note that, for block sizes where we apply vectorization, vectors in A will
+ * always be 8-byte aligned.
+ */
+static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
+	vector_float const *restrict addr = (vector_float const *restrict)a;
+	vector_float y;
+
+#if __GNUC__ < 9
+	// hex-encode vl %[out],%[addr],3
+	asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
+	    : [ out ] "=v"(y)
+	    : [ addr ] "R"(*addr));
+#else
+	y = *addr;
+#endif
+
+	return y;
+}
+
+/**
+ * Calculate a row-block of C_i of size ROWS x COLS using vector intrinsics.
+ *
+ * @param[in]	A	Pointer to the current block of input matrix A.
+ * @param[in]	bk	Number of columns in A.
+ * @param[in]	B	Pointer to the current block of input matrix B.
+ * @param[inout] C	Pointer to the current block of output matrix C.
+ * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	alpha	Scalar factor.
+ */
+#define VECTOR_BLOCK(ROWS, COLS)                                              \
+	static inline void GEBP_block_##ROWS##_##COLS(                        \
+	    FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B,    \
+	    FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) {                   \
+		_Static_assert(                                               \
+		    ROWS % VLEN_FLOATS == 0,                                  \
+		    "rows in block must be multiples of vector length");      \
+		vector_float Caux[ROWS / VLEN_FLOATS][COLS];                  \
+                                                                              \
+		for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++)             \
+			for (BLASLONG j = 0; j < COLS; j++)                   \
+				Caux[i][j] = vec_splats(ZERO);                \
+                                                                              \
+		/*                                                            \
+		 * Stream over the row-block of A, which is packed            \
+		 * column-by-column, multiply by coefficients in B and add up \
+		 * into temporaries Caux (which the compiler will hold in     \
+		 * registers). Vectorization: Multiply column vectors from A  \
+		 * with scalars from B and add up in column vectors of Caux.  \
+		 * That equates to unrolling the loop over rows (in i) and    \
+		 * executing each unrolled iteration as a vector element.     \
+		 */                                                           \
+		for (BLASLONG k = 0; k < bk; k++) {                           \
+			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
+				vector_float Ak = vec_load_hinted(            \
+				    A + i * VLEN_FLOATS + k * ROWS);          \
+                                                                              \
+				for (BLASLONG j = 0; j < COLS; j++)           \
+					Caux[i][j] += Ak * B[j + k * COLS];   \
+			}                                                     \
+		}                                                             \
+                                                                              \
+		/*                                                            \
+		 * Unpack the row-block of C_aux into outer C_i, multiply by  \
+		 * alpha and add up.                                          \
+		 */                                                           \
+		for (BLASLONG j = 0; j < COLS; j++) {                         \
+			for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) {   \
+				vector_float *C_ij =                          \
+				    (vector_float *)(C + i * VLEN_FLOATS +    \
+						     j * ldc);                \
+				if (trmm) {                                   \
+					*C_ij = alpha * Caux[i][j];           \
+				} else {                                      \
+					*C_ij += alpha * Caux[i][j];          \
+				}                                             \
+			}                                                     \
+		}                                                             \
+	}
+
+#if UNROLL_M == 16
+VECTOR_BLOCK(16, 4)
+VECTOR_BLOCK(16, 2)
+VECTOR_BLOCK(16, 1)
+#endif
+#if UNROLL_N == 8
+VECTOR_BLOCK(8, 8)
+VECTOR_BLOCK(4, 8)
+#endif
+VECTOR_BLOCK(8, 4)
+VECTOR_BLOCK(8, 2)
+VECTOR_BLOCK(8, 1)
+VECTOR_BLOCK(4, 4)
+VECTOR_BLOCK(4, 2)
+VECTOR_BLOCK(4, 1)
+
+#ifdef DOUBLE
+VECTOR_BLOCK(2, 4)
+VECTOR_BLOCK(2, 2)
+#endif
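+
+/*
+ * Editorial note: each VECTOR_BLOCK(ROWS, COLS) above expands into a function
+ * GEBP_block_ROWS_COLS; e.g., VECTOR_BLOCK(16, 4) defines GEBP_block_16_4().
+ * GEBP_block() below dispatches to these variants by block size.
+ */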
+
+/**
+ * Handle the calculation for row blocks in C_i of any size by dispatching into
+ * macro-defined (inline) functions or by deferring to a simple generic
+ * implementation. Note that the compiler can remove this awkward-looking
+ * dispatching code while inlining.
+ *
+ * @param[in]	m	Number of rows in block C_i.
+ * @param[in]	n	Number of columns in block C_i.
+ * @param[in]	first_row	Index of the first row of the block C_i (relative to C).
+ * @param[in]	A	Pointer to input matrix A (note: all of it).
+ * @param[in]	k	Number of columns in A and rows in B.
+ * @param[in]	B	Pointer to the current column block (panel) of input matrix B.
+ * @param[inout] C	Pointer to the current column block (panel) of output matrix C.
+ * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	alpha	Scalar factor.
+ * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
+ * @param[in]	off	Running offset for handling triangular matrices.
+ */
+static inline void GEBP_block(BLASLONG m, BLASLONG n,
+			      BLASLONG first_row,
+			      const FLOAT *restrict A, BLASLONG k,
+			      const FLOAT *restrict B,
+			      FLOAT *restrict C, BLASLONG ldc,
+			      FLOAT alpha,
+			      BLASLONG offset, BLASLONG off)
+{
+	if (trmm && left)
+		off = offset + first_row;
+
+	A += first_row * k;
+	C += first_row;
+
+	if (trmm) {
+		if (backwards) {
+			A += off * m;
+			B += off * n;
+			k -= off;
+		} else {
+			if (left) {
+				k = off + m;
+			} else {
+				k = off + n;
+			}
+		}
+	}
+
+#define BLOCK(bm, bn)                                           \
+	if (m == bm && n == bn) {                               \
+		GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
+		return;                                         \
+	}
+
+#if UNROLL_M == 16
+	BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
+#endif
+#if UNROLL_N == 8
+	BLOCK(8, 8); BLOCK(4, 8);
+#endif
+	BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
+	BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
+
+#ifdef DOUBLE
+	BLOCK(2, 4);
+	BLOCK(2, 2);
+#endif
+
+#undef BLOCK
+
+	/* simple implementation for smaller block sizes: */
+	FLOAT Caux[m][n] __attribute__ ((aligned (16)));
+
+	/*
+	 * Peel off the first iteration (i.e., column of A) for initializing
+	 * Caux.
+	 */
+	for (BLASLONG i = 0; i < m; i++)
+		for (BLASLONG j = 0; j < n; j++)
+			Caux[i][j] = A[i] * B[j];
+
+	for (BLASLONG kk = 1; kk < k; kk++)
+		for (BLASLONG i = 0; i < m; i++)
+			for (BLASLONG j = 0; j < n; j++)
+				Caux[i][j] += A[i + kk * m] * B[j + kk * n];
+
+	for (BLASLONG i = 0; i < m; i++)
+		for (BLASLONG j = 0; j < n; j++)
+			if (trmm) {
+				C[i + j * ldc] = alpha * Caux[i][j];
+			} else {
+				C[i + j * ldc] += alpha * Caux[i][j];
+			}
+}
+
+/**
+ * Handle a column block (panel) of C and B while calculating C += alpha * (A * B).
+ *
+ * @param[in]	num_cols	Number of columns in the block (in C and B).
+ * @param[in]	first_col	First column of the current block (in C and B).
+ * @param[in]	A	Pointer to input matrix A.
+ * @param[in]	bk	Number of columns in A and rows in B.
+ * @param[in]	B	Pointer to input matrix B (note: all of it).
+ * @param[in]	bm	Number of rows in C and A.
+ * @param[inout] C	Pointer to output matrix C (note: all of it).
+ * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	alpha	Scalar factor.
+ * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
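+ *
+ * For example, with unroll_m = 16 and bm = 23 rows, the row loop below emits
+ * blocks of 16, 4, 2, and 1 rows (23 = 16 + 4 + 2 + 1).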
+ */
+static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
+				     const FLOAT *restrict A, BLASLONG bk,
+				     const FLOAT *restrict B, BLASLONG bm,
+				     FLOAT *restrict C, BLASLONG ldc,
+				     FLOAT alpha,
+				     BLASLONG const offset) {
+
+	FLOAT *restrict C_i = C + first_col * ldc;
+	/*
+	 * B is in column-order with n_r packed row elements, which does
+	 * not matter -- we always move in full such blocks of
+	 * column * pack.
+	 */
+	const FLOAT *restrict B_i = B + first_col * bk;
+
+	BLASLONG off = 0;
+	if (trmm) {
+		if (left) {
+			off = offset;
+		} else {
+			off = -offset + first_col;
+		}
+	}
+
+	/*
+	 * Calculate C_aux := A * B_i,
+	 * then unpack C_i += alpha * C_aux.
+	 *
+	 * For that purpose, further partition C_aux and A into blocks
+	 * of m_r (unroll_m) rows, or powers of 2 if smaller.
+	 */
+	BLASLONG row = 0;
+	for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
+		for (; bm - row >= block_size; row += block_size)
+			GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
+				   ldc, alpha, offset, off);
+}
+
+/**
+ * Inner kernel for matrix-matrix multiplication, C += alpha * (A * B),
+ * where C is an m-by-n matrix, A is m-by-k, and B is k-by-n. Note that A, B,
+ * and C are pointers to submatrices of the actual matrices.
+ *
+ * For triangular matrix multiplication, calculate B := alpha * (A * B) where A
+ * or B can be triangular (in case of A, the macro LEFT will be defined).
+ *
+ * @param[in]	bm	Number of rows in C and A.
+ * @param[in]	bn	Number of columns in C and B.
+ * @param[in]	bk	Number of columns in A and rows in B.
+ * @param[in]	alpha	Scalar factor.
+ * @param[in]	ba	Pointer to input matrix A.
+ * @param[in]	bb	Pointer to input matrix B.
+ * @param[inout] C	Pointer to output matrix C.
+ * @param[in]	ldc	Offset between elements in adjacent columns in C.
+ * @param[in]	offset	Number of columns of A and rows of B to skip (for triangular matrices).
+ * @returns 0 on success.
+ */
+int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
+	  FLOAT *restrict ba, FLOAT *restrict bb,
+	  FLOAT *restrict C, BLASLONG ldc
+#ifdef TRMMKERNEL
+	  , BLASLONG offset
+#endif
+	  )
+{
+	if ((bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
+		return 0;
+
+	/*
+	 * The interface code allocates buffers for ba and bb at page
+	 * granularity (i.e., using mmap(MAP_ANONYMOUS)), so enable the
+	 * compiler to make use of that fact in vector load operations.
+	 */
+	ba = __builtin_assume_aligned(ba, 16);
+	bb = __builtin_assume_aligned(bb, 16);
+
+	/*
+	 * Use offset and off even when compiled as SGEMMKERNEL to simplify
+	 * function signatures and function calls.
+	 */
+#ifndef TRMMKERNEL
+	BLASLONG const offset = 0;
+#endif
+
+	/*
+	 * Partition B and C into blocks of n_r (unroll_n) columns, called B_i
+	 * and C_i. For each partition, calculate C_i += alpha * (A * B_i).
+	 *
+	 * For remaining columns that do not fill up a block of n_r,
+	 * iteratively use smaller block sizes of powers of 2.
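+	 * For example, bn = 7 with unroll_n = 4 yields column blocks of
+	 * widths 4, 2, and 1 (7 = 4 + 2 + 1).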
+	 */
+	BLASLONG col = 0;
+	for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
+		for (; bn - col >= block_size; col += block_size)
+			GEBP_column_block(block_size, col, ba, bk, bb, bm, C,
+					  ldc, alpha, offset);
+
+	return 0;
+}
diff --git a/param.h b/param.h
index 7094249e8..6f0a3b727 100644
--- a/param.h
+++ b/param.h
@@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M 8
+#define SGEMM_DEFAULT_UNROLL_M 16
 #define SGEMM_DEFAULT_UNROLL_N 4
 
 #define DGEMM_DEFAULT_UNROLL_M 8
diff --git a/test/compare_sgemm_shgemm.c b/test/compare_sgemm_shgemm.c
index d5bd84b91..7e254f844 100644
--- a/test/compare_sgemm_shgemm.c
+++ b/test/compare_sgemm_shgemm.c
@@ -46,6 +46,27 @@ typedef union
   } bits;
 } bfloat16_bits;
 
+typedef union
+{
+  float v;
+  struct
+  {
+    uint32_t m:23;
+    uint32_t e:8;
+    uint32_t s:1;
+  } bits;
+} float32_bits;
+
+float
+float16to32 (bfloat16_bits f16)
+{
+  float32_bits f32;
+  f32.bits.s = f16.bits.s;
+  f32.bits.e = f16.bits.e;
+  f32.bits.m = (uint32_t) f16.bits.m << 16;
+  return f32.v;
+}
+
 int
 main (int argc, char *argv[])
 {
@@ -55,8 +76,6 @@ main (int argc, char *argv[])
   int loop = 100;
   char transA = 'N', transB = 'N';
   float alpha = 1.0, beta = 0.0;
-  char transa = 'N';
-  char transb = 'N';
 
   for (int x = 0; x <= loop; x++)
     {
@@ -65,30 +84,45 @@ main (int argc, char *argv[])
       float B[k * n];
       float C[m * n];
       bfloat16_bits AA[m * k], BB[k * n];
-      float CC[m * n];
+      float DD[m * n], CC[m * n];
 
       for (int j = 0; j < m; j++)
 	{
 	  for (int i = 0; i < m; i++)
 	    {
-	      A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5;
-	      B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5;
+	      A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
+	      B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
 	      C[j * k + i] = 0;
 	      AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16;
 	      BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16;
 	      CC[j * k + i] = 0;
+	      DD[j * k + i] = 0;
 	    }
 	}
       SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
-	     &m, B, &k, &beta, C, &m);
+	     &m, B, &k, &beta, C, &m);
       SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
-	     &m, BB, &k, &beta, CC, &m);
-
+	      &m, BB, &k, &beta, CC, &m);
       for (i = 0; i < n; i++)
-	for (j = 0; j < m; j++)
-	  for (l = 0; l < k; l++)
-	    if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0)
-	      ret++;
+	for (j = 0; j < m; j++)
+	  for (l = 0; l < k; l++)
+	    if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)
+	      ret++;
+      if (transA == 'N' && transB == 'N')
+	{
+	  for (i = 0; i < n; i++)
+	    for (j = 0; j < m; j++)
+	      for (l = 0; l < k; l++)
+		{
+		  DD[i * m + j] +=
+		    float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]);
+		}
+	  for (i = 0; i < n; i++)
+	    for (j = 0; j < m; j++)
+	      for (l = 0; l < k; l++)
+		if (CC[i * m + j] != DD[i * m + j])
+		  ret++;
+	}
     }
   if (ret != 0)
    fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret);
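
A quick way to exercise these changes (a sketch, assuming a Linux on Z build
host with gcc >= 7; OPENBLAS_CORETYPE is the existing environment override
that force_coretype() above accepts, and the benchmark binaries take
from/to/step arguments for the matrix sizes):

    make DYNAMIC_ARCH=1 BUILD_HALF=1
    make -C benchmark goto
    OPENBLAS_CORETYPE=Z14 ./benchmark/sgemm.goto 128 2048 128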