commit
2931feb575
|
@ -184,3 +184,6 @@ In chronological order:
|
|||
* Rajalakshmi Srinivasaraghavan <https://github.com/RajalakshmiSR>
|
||||
* [2020-04-15] Half-precision GEMM for bfloat16
|
||||
|
||||
* Marius Hillenbrand <https://github.com/mhillenibm>
|
||||
* [2020-05-12] Revise dynamic architecture detection for IBM z
|
||||
* [2020-05-12] Add new sgemm and strmm kernel for IBM z14
|
||||
|
|
|
@ -563,8 +563,27 @@ DYNAMIC_CORE += EMAG8180
|
|||
endif
|
||||
|
||||
ifeq ($(ARCH), zarch)
|
||||
DYNAMIC_CORE = Z13
|
||||
DYNAMIC_CORE = ZARCH_GENERIC
|
||||
|
||||
# Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
|
||||
GCC_GE_52 := $(subst 0,,$(shell expr `$(CC) -dumpversion` \>= "5.2"))
|
||||
|
||||
ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
|
||||
RHEL_WITH_Z13 := $(subst 0,,$(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"))
|
||||
endif
|
||||
|
||||
ifeq ($(or $(GCC_GE_52),$(RHEL_WITH_Z13)), 1)
|
||||
DYNAMIC_CORE += Z13
|
||||
else
|
||||
$(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
|
||||
endif
|
||||
|
||||
GCC_MAJOR_GE_7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
|
||||
ifeq ($(GCC_MAJOR_GE_7), 1)
|
||||
DYNAMIC_CORE += Z14
|
||||
else
|
||||
$(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
|
||||
endif
|
||||
endif
|
||||
|
||||
ifeq ($(ARCH), power)
|
||||
|
@ -855,7 +874,7 @@ ifneq ($(INTERFACE64), 0)
|
|||
FCOMMON_OPT += -i8
|
||||
endif
|
||||
endif
|
||||
FCOMMON_OPT += -recursive
|
||||
FCOMMON_OPT += -recursive -fp-model strict -assume protect-parens
|
||||
ifeq ($(USE_OPENMP), 1)
|
||||
FCOMMON_OPT += -fopenmp
|
||||
endif
|
||||
|
|
|
@ -5,6 +5,6 @@ FCOMMON_OPT += -march=z13 -mzvector
|
|||
endif
|
||||
|
||||
ifeq ($(CORE), Z14)
|
||||
CCOMMON_OPT += -march=z14 -mzvector
|
||||
CCOMMON_OPT += -march=z14 -mzvector -O3
|
||||
FCOMMON_OPT += -march=z14 -mzvector
|
||||
endif
|
||||
|
|
|
@ -49,6 +49,12 @@ else
|
|||
GOTO_LAPACK_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
GOTO_HALF_TARGETS=shgemm.goto
|
||||
else
|
||||
GOTO_HALF_TARGETS=
|
||||
endif
|
||||
|
||||
ifeq ($(OSNAME), WINNT)
|
||||
|
||||
goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
||||
|
@ -91,7 +97,7 @@ goto :: slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
|
|||
sgetri.goto dgetri.goto cgetri.goto zgetri.goto \
|
||||
spotrf.goto dpotrf.goto cpotrf.goto zpotrf.goto \
|
||||
ssymm.goto dsymm.goto csymm.goto zsymm.goto \
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto $(GOTO_HALF_TARGETS)
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -264,7 +270,7 @@ goto :: sgemm.goto dgemm.goto cgemm.goto zgemm.goto \
|
|||
samin.goto damin.goto camin.goto zamin.goto \
|
||||
smin.goto dmin.goto \
|
||||
saxpby.goto daxpby.goto caxpby.goto zaxpby.goto \
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS)
|
||||
snrm2.goto dnrm2.goto scnrm2.goto dznrm2.goto $(GOTO_LAPACK_TARGETS) $(GOTO_HALF_TARGETS)
|
||||
|
||||
acml :: slinpack.acml dlinpack.acml clinpack.acml zlinpack.acml \
|
||||
scholesky.acml dcholesky.acml ccholesky.acml zcholesky.acml \
|
||||
|
@ -614,6 +620,11 @@ zcholesky.essl : zcholesky.$(SUFFIX)
|
|||
-$(CC) $(CFLAGS) -o $(@F) $^ $(LIBESSL) $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB)
|
||||
|
||||
##################################### Sgemm ####################################################
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.goto : shgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
endif
|
||||
|
||||
sgemm.goto : sgemm.$(SUFFIX) ../$(LIBNAME)
|
||||
$(CC) $(CFLAGS) -o $(@F) $^ $(CEXTRALIB) $(EXTRALIB) $(FEXTRALIB) -lm
|
||||
|
||||
|
@ -2916,6 +2927,11 @@ ccholesky.$(SUFFIX) : cholesky.c
|
|||
zcholesky.$(SUFFIX) : cholesky.c
|
||||
$(CC) $(CFLAGS) -c -DCOMPLEX -DDOUBLE -o $(@F) $^
|
||||
|
||||
ifeq ($(BUILD_HALF),1)
|
||||
shgemm.$(SUFFIX) : gemm.c
|
||||
$(CC) $(CFLAGS) -c -DHALF -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
endif
|
||||
|
||||
sgemm.$(SUFFIX) : gemm.c
|
||||
$(CC) $(CFLAGS) -c -UCOMPLEX -UDOUBLE -o $(@F) $^
|
||||
|
||||
|
|
|
@ -39,6 +39,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#ifdef DOUBLE
|
||||
#define GEMM BLASFUNC(dgemm)
|
||||
#elif defined(HALF)
|
||||
#define GEMM BLASFUNC(shgemm)
|
||||
#else
|
||||
#define GEMM BLASFUNC(sgemm)
|
||||
#endif
|
||||
|
@ -120,7 +122,8 @@ static void *huge_malloc(BLASLONG size){
|
|||
|
||||
int main(int argc, char *argv[]){
|
||||
|
||||
FLOAT *a, *b, *c;
|
||||
IFLOAT *a, *b;
|
||||
FLOAT *c;
|
||||
FLOAT alpha[] = {1.0, 0.0};
|
||||
FLOAT beta [] = {0.0, 0.0};
|
||||
char transa = 'N';
|
||||
|
@ -184,10 +187,10 @@ int main(int argc, char *argv[]){
|
|||
k = to;
|
||||
}
|
||||
|
||||
if (( a = (FLOAT *)malloc(sizeof(FLOAT) * m * k * COMPSIZE)) == NULL) {
|
||||
if (( a = (IFLOAT *)malloc(sizeof(IFLOAT) * m * k * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
if (( b = (FLOAT *)malloc(sizeof(FLOAT) * k * n * COMPSIZE)) == NULL) {
|
||||
if (( b = (IFLOAT *)malloc(sizeof(IFLOAT) * k * n * COMPSIZE)) == NULL) {
|
||||
fprintf(stderr,"Out of Memory!!\n");exit(1);
|
||||
}
|
||||
if (( c = (FLOAT *)malloc(sizeof(FLOAT) * m * n * COMPSIZE)) == NULL) {
|
||||
|
@ -199,10 +202,10 @@ int main(int argc, char *argv[]){
|
|||
#endif
|
||||
|
||||
for (i = 0; i < m * k * COMPSIZE; i++) {
|
||||
a[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
a[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
for (i = 0; i < k * n * COMPSIZE; i++) {
|
||||
b[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
b[i] = ((IFLOAT) rand() / (IFLOAT) RAND_MAX) - 0.5;
|
||||
}
|
||||
for (i = 0; i < m * n * COMPSIZE; i++) {
|
||||
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
|
||||
|
|
|
@ -43,7 +43,8 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
if (NOT "${line_match}" STREQUAL "")
|
||||
#message(STATUS "match on ${line_match}")
|
||||
set(var_name ${CMAKE_MATCH_1})
|
||||
set(var_value ${CMAKE_MATCH_2})
|
||||
# set(var_value ${CMAKE_MATCH_2})
|
||||
string(STRIP ${CMAKE_MATCH_2} var_value)
|
||||
# check for Makefile variables in the string, e.g. $(TSUFFIX)
|
||||
string(REGEX MATCHALL "\\$\\(([0-9_a-zA-Z]+)\\)" make_var_matches ${var_value})
|
||||
foreach (make_var ${make_var_matches})
|
||||
|
@ -63,7 +64,7 @@ macro(ParseMakefileVars MAKEFILE_IN)
|
|||
string(REGEX MATCH "ifeq \\(\\$\\(([_A-Z]+)\\),[ \t]*([0-9_A-Z]+)\\)" line_match "${makefile_line}")
|
||||
if (NOT "${line_match}" STREQUAL "")
|
||||
# message(STATUS "IFEQ: ${line_match} first: ${CMAKE_MATCH_1} second: ${CMAKE_MATCH_2}")
|
||||
if (${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
|
||||
if (DEFINED ${${CMAKE_MATCH_1}} AND ${${CMAKE_MATCH_1}} STREQUAL ${CMAKE_MATCH_2})
|
||||
# message (STATUS "condition is true")
|
||||
set (IfElse 1)
|
||||
else ()
|
||||
|
|
|
@ -1,12 +1,58 @@
|
|||
|
||||
#include "common.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
// Gate kernels for z13 and z14 on gcc version
|
||||
#if (__GNUC__ == 5 && __GNUC_MINOR__ >= 2) || __GNUC__ >= 6 || \
|
||||
/* RHEL 7 since 7.3: */ \
|
||||
(__GNUC__ == 4 && __GNUC_MINOR__ == 8 && __GNUC_PATCHLEVEL__ == 5 && \
|
||||
__GNUC_RH_RELEASE__ >= 11)
|
||||
#define HAVE_Z13_SUPPORT
|
||||
#endif
|
||||
|
||||
#if __GNUC__ >= 7
|
||||
#define HAVE_Z14_SUPPORT
|
||||
#endif
|
||||
|
||||
// Guard the use of getauxval() on glibc version >= 2.16
|
||||
#ifdef __GLIBC__
|
||||
#include <features.h>
|
||||
#if __GLIBC_PREREQ(2, 16)
|
||||
#include <sys/auxv.h>
|
||||
#define HAVE_GETAUXVAL 1
|
||||
|
||||
static unsigned long get_hwcap(void)
|
||||
{
|
||||
unsigned long hwcap = getauxval(AT_HWCAP);
|
||||
char *maskenv;
|
||||
|
||||
// honor requests for not using specific CPU features in LD_HWCAP_MASK
|
||||
maskenv = getenv("LD_HWCAP_MASK");
|
||||
if (maskenv)
|
||||
hwcap &= strtoul(maskenv, NULL, 0);
|
||||
|
||||
return hwcap;
|
||||
// note that a missing auxval is interpreted as no capabilities
|
||||
// available, which is safe.
|
||||
}
|
||||
|
||||
#else // __GLIBC_PREREQ(2, 16)
|
||||
#warn "Cannot detect SIMD support in Z13 or newer architectures since glibc is older than 2.16"
|
||||
|
||||
static unsigned long get_hwcap(void) {
|
||||
// treat missing support for getauxval() as no capabilities available,
|
||||
// which is safe.
|
||||
return 0;
|
||||
}
|
||||
#endif // __GLIBC_PREREQ(2, 16)
|
||||
#endif // __GLIBC
|
||||
|
||||
extern gotoblas_t gotoblas_ZARCH_GENERIC;
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
extern gotoblas_t gotoblas_Z13;
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
extern gotoblas_t gotoblas_Z14;
|
||||
//extern gotoblas_t gotoblas_Z15;
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
//extern gotoblas_t gotoblas_Z14;
|
||||
//#endif
|
||||
#endif
|
||||
|
||||
#define NUM_CORETYPES 4
|
||||
|
||||
|
@ -16,47 +62,50 @@ static char* corename[] = {
|
|||
"unknown",
|
||||
"Z13",
|
||||
"Z14",
|
||||
// "Z15",
|
||||
"ZARCH_GENERIC",
|
||||
};
|
||||
|
||||
char* gotoblas_corename(void) {
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
if (gotoblas == &gotoblas_Z13) return corename[1];
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
if (gotoblas == &gotoblas_Z14) return corename[2];
|
||||
// if (gotoblas == &gotoblas_Z15) return corename[3];
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// if (gotoblas == &gotoblas_POWER9) return corename[3];
|
||||
//#endif
|
||||
return corename[0]; // try generic?
|
||||
#endif
|
||||
if (gotoblas == &gotoblas_ZARCH_GENERIC) return corename[3];
|
||||
|
||||
return corename[0];
|
||||
}
|
||||
|
||||
// __builtin_cpu_is is not supported by zarch
|
||||
/**
|
||||
* Detect the fitting set of kernels by retrieving the CPU features supported by
|
||||
* OS from the auxiliary value AT_HWCAP and choosing the set of kernels
|
||||
* ("coretype") that exploits most of the features and can be compiled with the
|
||||
* available gcc version.
|
||||
* Note that we cannot use vector registers on a z13 or newer unless supported
|
||||
* by the OS kernel (which needs to handle them properly during context switch).
|
||||
*/
|
||||
static gotoblas_t* get_coretype(void) {
|
||||
FILE* infile;
|
||||
char buffer[512], * p;
|
||||
|
||||
p = (char*)NULL;
|
||||
infile = fopen("/proc/sysinfo", "r");
|
||||
while (fgets(buffer, sizeof(buffer), infile)) {
|
||||
if (!strncmp("Type", buffer, 4)) {
|
||||
p = strchr(buffer, ':') + 2;
|
||||
#if 0
|
||||
fprintf(stderr, "%s\n", p);
|
||||
unsigned long hwcap __attribute__((unused)) = get_hwcap();
|
||||
|
||||
// z14 and z15 systems: exploit Vector Facility (SIMD) and
|
||||
// Vector-Enhancements Facility 1 (float SIMD instructions), if present.
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
if ((hwcap & HWCAP_S390_VX) && (hwcap & HWCAP_S390_VXE))
|
||||
return &gotoblas_Z14;
|
||||
#endif
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
fclose(infile);
|
||||
// z13: Vector Facility (SIMD for double)
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
if (hwcap & HWCAP_S390_VX)
|
||||
return &gotoblas_Z13;
|
||||
#endif
|
||||
|
||||
if (strstr(p, "2964")) return &gotoblas_Z13;
|
||||
if (strstr(p, "2965")) return &gotoblas_Z13;
|
||||
if (strstr(p, "3906")) return &gotoblas_Z14;
|
||||
if (strstr(p, "3907")) return &gotoblas_Z14;
|
||||
if (strstr(p, "8561")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
if (strstr(p, "8562")) return &gotoblas_Z14; // fallback z15 to z14
|
||||
|
||||
return NULL; // should be ZARCH_GENERIC
|
||||
// fallback in case of missing compiler support, systems before z13, or
|
||||
// when the OS does not advertise support for the Vector Facility (e.g.,
|
||||
// missing support in the OS kernel)
|
||||
return &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
|
||||
static gotoblas_t* force_coretype(char* coretype) {
|
||||
|
@ -76,12 +125,13 @@ static gotoblas_t* force_coretype(char* coretype) {
|
|||
|
||||
switch (found)
|
||||
{
|
||||
#ifdef HAVE_Z13_SUPPORT
|
||||
case 1: return (&gotoblas_Z13);
|
||||
#endif
|
||||
#ifdef HAVE_Z14_SUPPORT
|
||||
case 2: return (&gotoblas_Z14);
|
||||
// case 3: return (&gotoblas_Z15);
|
||||
//#if (!defined C_GCC) || (GCC_VERSION >= 60000)
|
||||
// case 3: return (&gotoblas_POWER9);
|
||||
//#endif
|
||||
#endif
|
||||
case 3: return (&gotoblas_ZARCH_GENERIC);
|
||||
default: return NULL;
|
||||
}
|
||||
snprintf(message, 128, "Core not found: %s\n", coretype);
|
||||
|
@ -109,9 +159,9 @@ void gotoblas_dynamic_init(void) {
|
|||
|
||||
if (gotoblas == NULL)
|
||||
{
|
||||
snprintf(coremsg, 128, "Falling back to Z14 core\n");
|
||||
snprintf(coremsg, 128, "Failed to detect system, falling back to generic z support.\n");
|
||||
openblas_warning(1, coremsg);
|
||||
gotoblas = &gotoblas_Z14;
|
||||
gotoblas = &gotoblas_ZARCH_GENERIC;
|
||||
}
|
||||
|
||||
if (gotoblas && gotoblas->init) {
|
||||
|
|
|
@ -24,3 +24,6 @@ DGEMM_BETA = dgemm_beta_skylakex.c
|
|||
|
||||
CGEMMKERNEL = cgemm_kernel_8x2_skylakex.c
|
||||
ZGEMMKERNEL = zgemm_kernel_4x2_skylakex.c
|
||||
|
||||
CSCALKERNEL = ../arm/zscal.c
|
||||
ZSCALKERNEL = ../arm/zscal.c
|
||||
|
|
|
@ -86,23 +86,23 @@ DGEMVTKERNEL = dgemv_t_4.c
|
|||
CGEMVTKERNEL = cgemv_t_4.c
|
||||
ZGEMVTKERNEL = zgemv_t_4.c
|
||||
|
||||
STRMMKERNEL = strmm8x4V.S
|
||||
STRMMKERNEL = gemm_vec.c
|
||||
DTRMMKERNEL = trmm8x4V.S
|
||||
CTRMMKERNEL = ctrmm4x4V.S
|
||||
ZTRMMKERNEL = ztrmm4x4V.S
|
||||
|
||||
SGEMMKERNEL = strmm8x4V.S
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMKERNEL = gemm_vec.c
|
||||
ifneq ($(SGEMM_UNROLL_M),$(SGEMM_UNROLL_N))
|
||||
SGEMMINCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMITCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
|
||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||
endif
|
||||
SGEMMONCOPY = ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||
|
||||
|
||||
|
||||
DGEMMKERNEL = gemm8x4V.S
|
||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||
|
@ -145,7 +145,3 @@ ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
|||
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -0,0 +1,472 @@
|
|||
/*
|
||||
* Copyright (c) IBM Corporation 2020.
|
||||
* All rights reserved.
|
||||
*
|
||||
* Redistribution and use in source and binary forms, with or without
|
||||
* modification, are permitted provided that the following conditions are
|
||||
* met:
|
||||
*
|
||||
* 1. Redistributions of source code must retain the above copyright
|
||||
* notice, this list of conditions and the following disclaimer.
|
||||
*
|
||||
* 2. Redistributions in binary form must reproduce the above copyright
|
||||
* notice, this list of conditions and the following disclaimer in
|
||||
* the documentation and/or other materials provided with the
|
||||
* distribution.
|
||||
* 3. Neither the name of the OpenBLAS project nor the names of
|
||||
* its contributors may be used to endorse or promote products
|
||||
* derived from this software without specific prior written
|
||||
* permission.
|
||||
*
|
||||
* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||
* LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
* SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
* CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
* OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
* USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
#include "common.h"
|
||||
#include <vecintrin.h>
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdio.h>
|
||||
#include <stdlib.h>
|
||||
|
||||
#ifdef COMPLEX
|
||||
#error "Handling for complex numbers is not supported in this kernel"
|
||||
#endif
|
||||
|
||||
#ifdef DOUBLE
|
||||
#define UNROLL_M DGEMM_DEFAULT_UNROLL_M
|
||||
#define UNROLL_N DGEMM_DEFAULT_UNROLL_N
|
||||
#else
|
||||
#define UNROLL_M SGEMM_DEFAULT_UNROLL_M
|
||||
#define UNROLL_N SGEMM_DEFAULT_UNROLL_N
|
||||
#endif
|
||||
|
||||
static const size_t unroll_m = UNROLL_M;
|
||||
static const size_t unroll_n = UNROLL_N;
|
||||
|
||||
/* Handling of triangular matrices */
|
||||
#ifdef TRMMKERNEL
|
||||
static const bool trmm = true;
|
||||
static const bool left =
|
||||
#ifdef LEFT
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
|
||||
static const bool backwards =
|
||||
#if defined(LEFT) != defined(TRANSA)
|
||||
true;
|
||||
#else
|
||||
false;
|
||||
#endif
|
||||
|
||||
#else
|
||||
static const bool trmm = false;
|
||||
static const bool left = false;
|
||||
static const bool backwards = false;
|
||||
#endif /* TRMMKERNEL */
|
||||
|
||||
/*
|
||||
* Background:
|
||||
*
|
||||
* The algorithm of GotoBLAS / OpenBLAS breaks down the matrix multiplication
|
||||
* problem by splitting all matrices into partitions multiple times, so that the
|
||||
* submatrices fit into the L1 or L2 caches. As a result, each multiplication of
|
||||
* submatrices can stream data fast from L1 and L2 caches. Inbetween, it copies
|
||||
* and rearranges the submatrices to enable contiguous memory accesses to
|
||||
* improve locality in both caches and TLBs.
|
||||
*
|
||||
* At the heart of the algorithm is this kernel, which multiplies, a "Block
|
||||
* matrix" A (small dimensions) with a "Panel matrix" B (number of rows is
|
||||
* small) and adds the result into a "Panel matrix" C; GotoBLAS calls this
|
||||
* operation GEBP. This kernel further partitions GEBP twice, such that (1)
|
||||
* submatrices of C and B fit into the L1 caches (GEBP_column_block) and (2) a
|
||||
* block of C fits into the registers, while multiplying panels from A and B
|
||||
* streamed from the L2 and L1 cache, respectively (GEBP_block).
|
||||
*
|
||||
*
|
||||
* Algorithm GEBP(A, B, C, m, n, k, alpha):
|
||||
*
|
||||
* The problem is calculating C += alpha * (A * B)
|
||||
* C is an m x n matrix, A is an m x k matrix, B is an k x n matrix.
|
||||
*
|
||||
* - C is in column-major-order, with an offset of ldc to the element in the
|
||||
* next column (same row).
|
||||
* - A is in row-major-order yet stores SGEMM_UNROLL_M elements of each column
|
||||
* contiguously while walking along rows.
|
||||
* - B is in column-major-order but packs SGEMM_UNROLL_N elements of a row
|
||||
* contiguously.
|
||||
* If the numbers of rows and columns are not multiples of SGEMM_UNROLL_M or
|
||||
* SGEMM_UNROLL_N, the remaining elements are arranged in blocks with power-of-2
|
||||
* dimensions (e.g., 5 remaining columns would be in a block-of-4 and a
|
||||
* block-of-1).
|
||||
*
|
||||
* Note that packing A and B into that form is taken care of by the caller in
|
||||
* driver/level3/level3.c (actually done by "copy kernels").
|
||||
*
|
||||
* Steps:
|
||||
* - Partition C and B into blocks of n_r (SGEMM_UNROLL_N) columns, C_j and B_j.
|
||||
* Now, B_j should fit into the L1 cache.
|
||||
* - For each partition, calculate C_j += alpha * (A * B_j) by
|
||||
* (1) Calculate C_aux := A * B_j (see below)
|
||||
* (2) unpack C_j = C_j + alpha * C_aux
|
||||
*
|
||||
*
|
||||
* Algorithm for Calculating C_aux:
|
||||
*
|
||||
* - Further partition C_aux and A into groups of m_r (SGEMM_UNROLL_M) rows,
|
||||
* such that the m_r x n_r-submatrix of C_aux can be held in registers. Each
|
||||
* submatrix of C_aux can be calculated independently, and the registers are
|
||||
* added back into C_j.
|
||||
*
|
||||
* - For each row-block of C_aux:
|
||||
* (uses a row block of A and full B_j)
|
||||
* - stream over all columns of A, multiply with elements from B and
|
||||
* accumulate in registers. (use different inner-kernels to exploit
|
||||
* vectorization for varying block sizes)
|
||||
* - add alpha * row block of C_aux back into C_j.
|
||||
*
|
||||
* Note that there are additional mechanics for handling triangular matrices,
|
||||
* calculating B := alpha (A * B) where either of the matrices A or B can be
|
||||
* triangular. In case of A, the macro "LEFT" is defined. In addition, A can
|
||||
* optionally be transposed.
|
||||
* The code effectively skips an "offset" number of columns in A and rows of B
|
||||
* in each block, to save unnecessary work by exploiting the triangular nature.
|
||||
* To handle all cases, the code discerns (1) a "left" mode when A is triangular
|
||||
* and (2) "forward" / "backwards" modes where only the first "offset"
|
||||
* columns/rows of A/B are used or where the first "offset" columns/rows are
|
||||
* skipped, respectively.
|
||||
*
|
||||
* Reference:
|
||||
*
|
||||
* The summary above is based on staring at various kernel implementations and:
|
||||
* K. Goto and R. A. Van de Geijn, Anatomy of High-Performance Matrix
|
||||
* Multiplication, in ACM Transactions of Mathematical Software, Vol. 34, No.
|
||||
* 3, May 2008.
|
||||
*/
|
||||
|
||||
#define VLEN_BYTES 16
|
||||
#define VLEN_FLOATS (VLEN_BYTES / sizeof(FLOAT))
|
||||
|
||||
typedef FLOAT vector_float __attribute__ ((vector_size (16)));
|
||||
|
||||
/**
|
||||
* Load a vector into register, and hint on 8-byte alignment to improve
|
||||
* performance. gcc-9 and newer will create these hints by itself. For older
|
||||
* compiler versions, use inline assembly to explicitly express the hint.
|
||||
* Provide explicit hex encoding to cater for binutils versions that do not know
|
||||
* about vector-load with alignment hints yet.
|
||||
*
|
||||
* Note that, for block sizes where we apply vectorization, vectors in A will
|
||||
* always be 8-byte aligned.
|
||||
*/
|
||||
static inline vector_float vec_load_hinted(FLOAT const *restrict a) {
|
||||
vector_float const *restrict addr = (vector_float const *restrict)a;
|
||||
vector_float y;
|
||||
|
||||
#if __GNUC__ < 9
|
||||
// hex-encode vl %[out],%[addr],3
|
||||
asm(".insn vrx,0xe70000003006,%[out],%[addr],3"
|
||||
: [ out ] "=v"(y)
|
||||
: [ addr ] "R"(*addr));
|
||||
#else
|
||||
y = *addr;
|
||||
#endif
|
||||
|
||||
return y;
|
||||
}
|
||||
|
||||
/**
|
||||
* Calculate for a row-block in C_i of size ROWSxCOLS using vector intrinsics.
|
||||
*
|
||||
* @param[in] A Pointer current block of input matrix A.
|
||||
* @param[in] k Number of columns in A.
|
||||
* @param[in] B Pointer current block of input matrix B.
|
||||
* @param[inout] C Pointer current block of output matrix C.
|
||||
* @param[in] ldc Offset between elements in adjacent columns in C.
|
||||
* @param[in] alpha Scalar factor.
|
||||
*/
|
||||
#define VECTOR_BLOCK(ROWS, COLS) \
|
||||
static inline void GEBP_block_##ROWS##_##COLS( \
|
||||
FLOAT const *restrict A, BLASLONG bk, FLOAT const *restrict B, \
|
||||
FLOAT *restrict C, BLASLONG ldc, FLOAT alpha) { \
|
||||
_Static_assert( \
|
||||
ROWS % VLEN_FLOATS == 0, \
|
||||
"rows in block must be multiples of vector length"); \
|
||||
vector_float Caux[ROWS / VLEN_FLOATS][COLS]; \
|
||||
\
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) \
|
||||
for (BLASLONG j = 0; j < COLS; j++) \
|
||||
Caux[i][j] = vec_splats(ZERO); \
|
||||
\
|
||||
/* \
|
||||
* Stream over the row-block of A, which is packed \
|
||||
* column-by-column, multiply by coefficients in B and add up \
|
||||
* into temporaries Caux (which the compiler will hold in \
|
||||
* registers). Vectorization: Multiply column vectors from A \
|
||||
* with scalars from B and add up in column vectors of Caux. \
|
||||
* That equates to unrolling the loop over rows (in i) and \
|
||||
* executing each unrolled iteration as a vector element. \
|
||||
*/ \
|
||||
for (BLASLONG k = 0; k < bk; k++) { \
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||
vector_float Ak = vec_load_hinted( \
|
||||
A + i * VLEN_FLOATS + k * ROWS); \
|
||||
\
|
||||
for (BLASLONG j = 0; j < COLS; j++) \
|
||||
Caux[i][j] += Ak * B[j + k * COLS]; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
/* \
|
||||
* Unpack row-block of C_aux into outer C_i, multiply by \
|
||||
* alpha and add up. \
|
||||
*/ \
|
||||
for (BLASLONG j = 0; j < COLS; j++) { \
|
||||
for (BLASLONG i = 0; i < ROWS / VLEN_FLOATS; i++) { \
|
||||
vector_float *C_ij = \
|
||||
(vector_float *)(C + i * VLEN_FLOATS + \
|
||||
j * ldc); \
|
||||
if (trmm) { \
|
||||
*C_ij = alpha * Caux[i][j]; \
|
||||
} else { \
|
||||
*C_ij += alpha * Caux[i][j]; \
|
||||
} \
|
||||
} \
|
||||
} \
|
||||
}
|
||||
|
||||
|
||||
#if UNROLL_M == 16
|
||||
VECTOR_BLOCK(16, 4)
|
||||
VECTOR_BLOCK(16, 2)
|
||||
VECTOR_BLOCK(16, 1)
|
||||
#endif
|
||||
#if UNROLL_N == 8
|
||||
VECTOR_BLOCK(8, 8)
|
||||
VECTOR_BLOCK(4, 8)
|
||||
#endif
|
||||
VECTOR_BLOCK(8, 4)
|
||||
VECTOR_BLOCK(8, 2)
|
||||
VECTOR_BLOCK(8, 1)
|
||||
VECTOR_BLOCK(4, 4)
|
||||
VECTOR_BLOCK(4, 2)
|
||||
VECTOR_BLOCK(4, 1)
|
||||
|
||||
#ifdef DOUBLE
|
||||
VECTOR_BLOCK(2, 4)
|
||||
VECTOR_BLOCK(2, 2)
|
||||
#endif
|
||||
|
||||
/**
|
||||
* Handle calculation for row blocks in C_i of any size by dispatching into
|
||||
* macro-defined (inline) functions or by deferring to a simple generic
|
||||
* implementation. Note that the compiler can remove this awkward-looking
|
||||
* dispatching code while inlineing.
|
||||
*
|
||||
* @param[in] m Number of rows in block C_i.
|
||||
* @param[in] n Number of columns in block C_i.
|
||||
* @param[in] first_row Index of first row of the block C_i (relative to C).
|
||||
* @param[in] A Pointer to input matrix A (note: all of it).
|
||||
* @param[in] k Number of columns in A and rows in B.
|
||||
* @param[in] B Pointer to current column block (panel) of input matrix B.
|
||||
* @param[inout] C Pointer to current column block (panel) of output matrix C.
|
||||
* @param[in] ldc Offset between elements in adjacent columns in C.
|
||||
* @param[in] alpha Scalar factor.
|
||||
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
||||
* @param[in] off Running offset for handling triangular matrices.
|
||||
*/
|
||||
static inline void GEBP_block(BLASLONG m, BLASLONG n,
|
||||
BLASLONG first_row,
|
||||
const FLOAT * restrict A, BLASLONG k,
|
||||
const FLOAT * restrict B,
|
||||
FLOAT *restrict C, BLASLONG ldc,
|
||||
FLOAT alpha,
|
||||
BLASLONG offset, BLASLONG off)
|
||||
{
|
||||
if (trmm && left)
|
||||
off = offset + first_row;
|
||||
|
||||
A += first_row * k;
|
||||
C += first_row;
|
||||
|
||||
if (trmm) {
|
||||
if (backwards) {
|
||||
A += off * m;
|
||||
B += off * n;
|
||||
k -= off;
|
||||
} else {
|
||||
if (left) {
|
||||
k = off + m;
|
||||
} else {
|
||||
k = off + n;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#define BLOCK(bm, bn) \
|
||||
if (m == bm && n == bn) { \
|
||||
GEBP_block_##bm##_##bn(A, k, B, C, ldc, alpha); \
|
||||
return; \
|
||||
}
|
||||
|
||||
#if UNROLL_M == 16
|
||||
BLOCK(16, 4); BLOCK(16, 2); BLOCK(16, 1);
|
||||
#endif
|
||||
#if UNROLL_N == 8
|
||||
BLOCK(8, 8); BLOCK(4, 8);
|
||||
#endif
|
||||
BLOCK(8, 4); BLOCK(8, 2); BLOCK(8, 1);
|
||||
BLOCK(4, 4); BLOCK(4, 2); BLOCK(4, 1);
|
||||
|
||||
#ifdef DOUBLE
|
||||
BLOCK(2, 4);
|
||||
BLOCK(2, 2);
|
||||
#endif
|
||||
|
||||
#undef BLOCK
|
||||
|
||||
/* simple implementation for smaller block sizes: */
|
||||
FLOAT Caux[m][n] __attribute__ ((aligned (16)));
|
||||
|
||||
/*
|
||||
* Peel off first iteration (i.e., column of A) for initializing Caux
|
||||
*/
|
||||
for (BLASLONG i = 0; i < m; i++)
|
||||
for (BLASLONG j = 0; j < n; j++)
|
||||
Caux[i][j] = A[i] * B[j];
|
||||
|
||||
for (BLASLONG kk = 1; kk < k; kk++)
|
||||
for (BLASLONG i = 0; i < m; i++)
|
||||
for (BLASLONG j = 0; j < n; j++)
|
||||
Caux[i][j] += A[i + kk * m] * B[j + kk * n];
|
||||
|
||||
for (BLASLONG i = 0; i < m; i++)
|
||||
for (BLASLONG j = 0; j < n; j++)
|
||||
if (trmm) {
|
||||
C[i + j * ldc] = alpha * Caux[i][j];
|
||||
} else {
|
||||
C[i + j * ldc] += alpha * Caux[i][j];
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle a column block (panel) of C and B while calculating C += alpha(A * B).
|
||||
*
|
||||
* @param[in] num_cols Number of columns in the block (in C and B).
|
||||
* @param[in] first_col First column of the current block (in C and B).
|
||||
* @param[in] A Pointer to input matrix A.
|
||||
* @param[in] bk Number of columns in A and rows in B.
|
||||
* @param[in] B Pointer to input matrix B (note: all of it).
|
||||
* @param[in] bm Number of rows in C and A.
|
||||
* @param[inout] C Pointer to output matrix C (note: all of it).
|
||||
* @param[in] ldc Offset between elements in adjacent columns in C.
|
||||
* @param[in] alpha Scalar factor.
|
||||
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
||||
*/
|
||||
static inline void GEBP_column_block(BLASLONG num_cols, BLASLONG first_col,
|
||||
const FLOAT *restrict A, BLASLONG bk,
|
||||
const FLOAT *restrict B, BLASLONG bm,
|
||||
FLOAT *restrict C, BLASLONG ldc,
|
||||
FLOAT alpha,
|
||||
BLASLONG const offset) {
|
||||
|
||||
FLOAT *restrict C_i = C + first_col * ldc;
|
||||
/*
|
||||
* B is in column-order with n_r packed row elements, which does
|
||||
* not matter -- we always move in full such blocks of
|
||||
* column*pack
|
||||
*/
|
||||
const FLOAT *restrict B_i = B + first_col * bk;
|
||||
|
||||
BLASLONG off = 0;
|
||||
if (trmm) {
|
||||
if (left) {
|
||||
off = offset;
|
||||
} else {
|
||||
off = -offset + first_col;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Calculate C_aux := A * B_j
|
||||
* then unpack C_i += alpha * C_aux.
|
||||
*
|
||||
* For that purpose, further partition C_aux and A into blocks
|
||||
* of m_r (unroll_m) rows, or powers-of-2 if smaller.
|
||||
*/
|
||||
BLASLONG row = 0;
|
||||
for (BLASLONG block_size = unroll_m; block_size > 0; block_size /= 2)
|
||||
for (; bm - row >= block_size; row += block_size)
|
||||
GEBP_block(block_size, num_cols, row, A, bk, B_i, C_i,
|
||||
ldc, alpha, offset, off);
|
||||
}
|
||||
|
||||
/**
|
||||
* Inner kernel for matrix-matrix multiplication. C += alpha (A * B)
|
||||
* where C is an m-by-n matrix, A is m-by-k and B is k-by-n. Note that A, B, and
|
||||
* C are pointers to submatrices of the actual matrices.
|
||||
*
|
||||
* For triangular matrix multiplication, calculate B := alpha (A * B) where A
|
||||
* or B can be triangular (in case of A, the macro LEFT will be defined).
|
||||
*
|
||||
* @param[in] bm Number of rows in C and A.
|
||||
* @param[in] bn Number of columns in C and B.
|
||||
* @param[in] bk Number of columns in A and rows in B.
|
||||
* @param[in] alpha Scalar factor.
|
||||
* @param[in] ba Pointer to input matrix A.
|
||||
* @param[in] bb Pointer to input matrix B.
|
||||
* @param[inout] C Pointer to output matrix C.
|
||||
* @param[in] ldc Offset between elements in adjacent columns in C.
|
||||
* @param[in] offset Number of columns of A and rows of B to skip (for triangular matrices).
|
||||
* @returns 0 on success.
|
||||
*/
|
||||
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha,
|
||||
FLOAT *restrict ba, FLOAT *restrict bb,
|
||||
FLOAT *restrict C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
, BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
if ( (bm == 0) || (bn == 0) || (bk == 0) || (alpha == ZERO))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* interface code allocates buffers for ba and bb at page
|
||||
* granularity (i.e., using mmap(MAP_ANONYMOUS), so enable the compiler
|
||||
* to make use of the fact in vector load operations.
|
||||
*/
|
||||
ba = __builtin_assume_aligned(ba, 16);
|
||||
bb = __builtin_assume_aligned(bb, 16);
|
||||
|
||||
/*
|
||||
* Use offset and off even when compiled as SGEMMKERNEL to simplify
|
||||
* function signatures and function calls.
|
||||
*/
|
||||
#ifndef TRMMKERNEL
|
||||
BLASLONG const offset = 0;
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Partition B and C into blocks of n_r (unroll_n) columns, called B_i
|
||||
* and C_i. For each partition, calculate C_i += alpha * (A * B_j).
|
||||
*
|
||||
* For remaining columns that do not fill up a block of n_r, iteratively
|
||||
* use smaller block sizes of powers of 2.
|
||||
*/
|
||||
BLASLONG col = 0;
|
||||
for (BLASLONG block_size = unroll_n; block_size > 0; block_size /= 2)
|
||||
for (; bn - col >= block_size; col += block_size)
|
||||
GEBP_column_block(block_size, col, ba, bk, bb, bm, C, ldc, alpha, offset);
|
||||
|
||||
return 0;
|
||||
}
|
2
param.h
2
param.h
|
@ -2999,7 +2999,7 @@ is a big desktop or server with abundant cache rather than a phone or embedded d
|
|||
#define GEMM_DEFAULT_OFFSET_B 0
|
||||
#define GEMM_DEFAULT_ALIGN 0x03fffUL
|
||||
|
||||
#define SGEMM_DEFAULT_UNROLL_M 8
|
||||
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
|
|
|
@ -46,6 +46,27 @@ typedef union
|
|||
} bits;
|
||||
} bfloat16_bits;
|
||||
|
||||
typedef union
|
||||
{
|
||||
float v;
|
||||
struct
|
||||
{
|
||||
uint32_t m:23;
|
||||
uint32_t e:8;
|
||||
uint32_t s:1;
|
||||
} bits;
|
||||
} float32_bits;
|
||||
|
||||
float
|
||||
float16to32 (bfloat16_bits f16)
|
||||
{
|
||||
float32_bits f32;
|
||||
f32.bits.s = f16.bits.s;
|
||||
f32.bits.e = f16.bits.e;
|
||||
f32.bits.m = (uint32_t) f16.bits.m << 16;
|
||||
return f32.v;
|
||||
}
|
||||
|
||||
int
|
||||
main (int argc, char *argv[])
|
||||
{
|
||||
|
@ -55,8 +76,6 @@ main (int argc, char *argv[])
|
|||
int loop = 100;
|
||||
char transA = 'N', transB = 'N';
|
||||
float alpha = 1.0, beta = 0.0;
|
||||
char transa = 'N';
|
||||
char transb = 'N';
|
||||
|
||||
for (int x = 0; x <= loop; x++)
|
||||
{
|
||||
|
@ -65,30 +84,45 @@ main (int argc, char *argv[])
|
|||
float B[k * n];
|
||||
float C[m * n];
|
||||
bfloat16_bits AA[m * k], BB[k * n];
|
||||
float CC[m * n];
|
||||
float DD[m * n], CC[m * n];
|
||||
|
||||
for (int j = 0; j < m; j++)
|
||||
{
|
||||
for (int i = 0; i < m; i++)
|
||||
{
|
||||
A[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5;
|
||||
B[j * k + i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) + 0.5;
|
||||
A[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||
B[j * k + i] = ((FLOAT) rand () / (FLOAT) RAND_MAX) + 0.5;
|
||||
C[j * k + i] = 0;
|
||||
AA[j * k + i].v = *(uint32_t *) & A[j * k + i] >> 16;
|
||||
BB[j * k + i].v = *(uint32_t *) & B[j * k + i] >> 16;
|
||||
CC[j * k + i] = 0;
|
||||
DD[j * k + i] = 0;
|
||||
}
|
||||
}
|
||||
SGEMM (&transA, &transB, &m, &n, &k, &alpha, A,
|
||||
&m, B, &k, &beta, C, &m);
|
||||
&m, B, &k, &beta, C, &m);
|
||||
SHGEMM (&transA, &transB, &m, &n, &k, &alpha, AA,
|
||||
&m, BB, &k, &beta, CC, &m);
|
||||
|
||||
&m, BB, &k, &beta, CC, &m);
|
||||
for (i = 0; i < n; i++)
|
||||
for (j = 0; j < m; j++)
|
||||
for (l = 0; l < k; l++)
|
||||
if (fabs(CC[i * m + j]-C[i * m + j]) > 1.0)
|
||||
ret++;
|
||||
for (j = 0; j < m; j++)
|
||||
for (l = 0; l < k; l++)
|
||||
if (fabs (CC[i * m + j] - C[i * m + j]) > 1.0)
|
||||
ret++;
|
||||
if (transA == 'N' && transB == 'N')
|
||||
{
|
||||
for (i = 0; i < n; i++)
|
||||
for (j = 0; j < m; j++)
|
||||
for (l = 0; l < k; l++)
|
||||
{
|
||||
DD[i * m + j] +=
|
||||
float16to32 (AA[l * m + j]) * float16to32 (BB[l + k * i]);
|
||||
}
|
||||
for (i = 0; i < n; i++)
|
||||
for (j = 0; j < m; j++)
|
||||
for (l = 0; l < k; l++)
|
||||
if (CC[i * m + j] != DD[i * m + j])
|
||||
ret++;
|
||||
}
|
||||
}
|
||||
if (ret != 0)
|
||||
fprintf (stderr, "FATAL ERROR SHGEMM - Return code: %d\n", ret);
|
||||
|
|
Loading…
Reference in New Issue