Merge pull request #84 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-09-15 23:13:30 +02:00 committed by GitHub
commit 3843bd188c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
52 changed files with 1549 additions and 154 deletions

View File

@ -277,5 +277,10 @@ COMMON_PROF = -pg
# If you want to enable the experimental BFLOAT16 support # If you want to enable the experimental BFLOAT16 support
# BUILD_HALF = 1 # BUILD_HALF = 1
# #
# the below is not yet configurable, use cmake if you need to build only select types
BUILD_SINGLE = 1
BUILD_DOUBLE = 1
BUILD_COMPLEX = 1
BUILD_COMPLEX16 = 1
# End of user configuration # End of user configuration
# #

View File

@ -295,6 +295,7 @@ endif
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4)
GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4) GCCVERSIONGT4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 4)
GCCVERSIONEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` = 5)
GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5) GCCVERSIONGT5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \> 5)
GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7) GCCVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 7)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
@ -593,35 +594,33 @@ endif
ifeq ($(ARCH), zarch) ifeq ($(ARCH), zarch)
DYNAMIC_CORE = ZARCH_GENERIC DYNAMIC_CORE = ZARCH_GENERIC
# if the compiler accepts -march=arch11 or -march=z13 and can compile a file # Z13 is supported since gcc-5.2, gcc-6, and in RHEL 7.3 and newer
# with z13-specific inline assembly, then we can include support for Z13. ifeq ($(GCCVERSIONGT5), 1)
# note: -march=z13 is equivalent to -march=arch11 yet some compiler releases ZARCH_SUPPORT_Z13 := 1
# only support one or the other. else ifeq ($(GCCVERSIONEQ5), 1)
# note: LLVM version 6.x supported -march=z13 yet could not handle vector ifeq ($(GCCMINORVERSIONGTEQ2), 1)
# registers in inline assembly, so the check for supporting the -march flag is ZARCH_SUPPORT_Z13 := 1
# not enough. endif
ZARCH_TEST_COMPILE=-c $(TOPDIR)/kernel/zarch/damin_z13.c -I$(TOPDIR) -o /dev/null > /dev/null 2> /dev/null endif
ZARCH_CC_SUPPORTS_ARCH11=$(shell $(CC) -march=arch11 $(ZARCH_TEST_COMPILE) && echo 1)
ZARCH_CC_SUPPORTS_Z13=$(shell $(CC) -march=z13 $(ZARCH_TEST_COMPILE) && echo 1)
ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH11), $(ZARCH_CC_SUPPORTS_Z13)), 1) ifeq ($(wildcard /etc/redhat-release), /etc/redhat-release)
ifeq ($(shell source /etc/os-release ; expr $$VERSION_ID \>= "7.3"), 1)
ZARCH_SUPPORT_Z13 := 1
endif
endif
ifeq ($(ZARCH_SUPPORT_Z13), 1)
DYNAMIC_CORE += Z13 DYNAMIC_CORE += Z13
CCOMMON_OPT += -DDYN_Z13
else else
$(info OpenBLAS: Not building Z13 kernels because the compiler $(CC) does not support it) $(info OpenBLAS: Not building Z13 kernels because gcc is older than 5.2 or 6.x)
endif endif
# as above for z13, check for -march=arch12 and z14 support in the compiler. ifeq ($(GCCVERSIONGTEQ7), 1)
ZARCH_CC_SUPPORTS_ARCH12=$(shell $(CC) -march=arch12 $(ZARCH_TEST_COMPILE) && echo 1)
ZARCH_CC_SUPPORTS_Z14=$(shell $(CC) -march=z14 $(ZARCH_TEST_COMPILE) && echo 1)
ifeq ($(or $(ZARCH_CC_SUPPORTS_ARCH12), $(ZARCH_CC_SUPPORTS_Z14)), 1)
DYNAMIC_CORE += Z14 DYNAMIC_CORE += Z14
CCOMMON_OPT += -DDYN_Z14
else else
$(info OpenBLAS: Not building Z14 kernels because the compiler $(CC) does not support it) $(info OpenBLAS: Not building Z14 kernels because gcc is older than 7.x)
endif
endif endif
endif # ARCH zarch
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
DYNAMIC_CORE = POWER6 DYNAMIC_CORE = POWER6
@ -1223,6 +1222,18 @@ endif
ifeq ($(BUILD_HALF), 1) ifeq ($(BUILD_HALF), 1)
CCOMMON_OPT += -DBUILD_HALF CCOMMON_OPT += -DBUILD_HALF
endif endif
ifeq ($(BUILD_SINGLE), 1)
CCOMMON_OPT += -DBUILD_SINGLE
endif
ifeq ($(BUILD_DOUBLE), 1)
CCOMMON_OPT += -DBUILD_DOUBLE
endif
ifeq ($(BUILD_COMPLEX), 1)
CCOMMON_OPT += -DBUILD_COMPLEX
endif
ifeq ($(BUILD_COMPLEX16), 1)
CCOMMON_OPT += -DBUILD_COMPLEX16
endif
CCOMMON_OPT += -DVERSION=\"$(VERSION)\" CCOMMON_OPT += -DVERSION=\"$(VERSION)\"

View File

@ -5,13 +5,14 @@ QBLASOBJS_P = $(QBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) CBLASOBJS_P = $(CBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) ZBLASOBJS_P = $(ZBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX)) XBLASOBJS_P = $(XBLASOBJS:.$(SUFFIX)=.$(PSUFFIX))
SHEXTOBJS_P = $(SHEXTOBJS:.$(SUFFIX)=.$(PSUFFIX))
COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX)) COMMONOBJS_P = $(COMMONOBJS:.$(SUFFIX)=.$(PSUFFIX))
HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX)) HPLOBJS_P = $(HPLOBJS:.$(SUFFIX)=.$(PSUFFIX))
BLASOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) BLASOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
BLASOBJS_P = $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P) BLASOBJS_P = $(SHEXTOBJS_P) $(SHBLASOBJS_P) $(SBLASOBJS_P) $(DBLASOBJS_P) $(CBLASOBJS_P) $(ZBLASOBJS_P)
ifdef EXPRECISION ifdef EXPRECISION
BLASOBJS += $(QBLASOBJS) $(XBLASOBJS) BLASOBJS += $(QBLASOBJS) $(XBLASOBJS)
@ -30,6 +31,7 @@ $(QBLASOBJS) $(QBLASOBJS_P) : override CFLAGS += -DXDOUBLE -UCOMPLEX
$(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX $(CBLASOBJS) $(CBLASOBJS_P) : override CFLAGS += -UDOUBLE -DCOMPLEX
$(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX $(ZBLASOBJS) $(ZBLASOBJS_P) : override CFLAGS += -DDOUBLE -DCOMPLEX
$(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX $(XBLASOBJS) $(XBLASOBJS_P) : override CFLAGS += -DXDOUBLE -DCOMPLEX
$(SHEXTOBJS) $(SHEXTOBJS_P) : override CFLAGS += -DHALF -UDOUBLE -UCOMPLEX
$(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SHBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(SBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
@ -38,6 +40,7 @@ $(QBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(CBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(ZBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF) $(XBLASOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
$(SHEXTOBJS_P) : override CFLAGS += -DPROFILE $(COMMON_PROF)
libs :: $(BLASOBJS) $(COMMONOBJS) libs :: $(BLASOBJS) $(COMMONOBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^

11
cblas.h
View File

@ -382,6 +382,17 @@ void cblas_cgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint
void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta, void cblas_zgeadd(OPENBLAS_CONST enum CBLAS_ORDER CORDER,OPENBLAS_CONST blasint crows, OPENBLAS_CONST blasint ccols, OPENBLAS_CONST double *calpha, double *a, OPENBLAS_CONST blasint clda, OPENBLAS_CONST double *cbeta,
double *c, OPENBLAS_CONST blasint cldc); double *c, OPENBLAS_CONST blasint cldc);
/*** BFLOAT16 and INT8 extensions ***/
/* convert float array to BFLOAT16 array by rounding */
void cblas_shstobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert double array to BFLOAT16 array by rounding */
void cblas_shdtobf16(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *in, OPENBLAS_CONST blasint incin, bfloat16 *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to float array */
void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, float *out, OPENBLAS_CONST blasint incout);
/* convert BFLOAT16 array to double array */
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot production of BFLOAT16 input arrays, and output as float */
float cblas_shdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -126,12 +126,14 @@ if (BUILD_HALF)
set(SHAXPYKERNEL ../arm/axpy.c) set(SHAXPYKERNEL ../arm/axpy.c)
set(SHAXPBYKERNEL ../arm/axpby.c) set(SHAXPBYKERNEL ../arm/axpby.c)
set(SHCOPYKERNEL ../arm/copy.c) set(SHCOPYKERNEL ../arm/copy.c)
set(SHDOTKERNEL ../arm/dot.c) set(SHDOTKERNEL ../x86_64/shdot.c)
set(SHROTKERNEL ../arm/rot.c) set(SHROTKERNEL ../arm/rot.c)
set(SHSCALKERNEL ../arm/scal.c) set(SHSCALKERNEL ../arm/scal.c)
set(SHNRM2KERNEL ../arm/nrm2.c) set(SHNRM2KERNEL ../arm/nrm2.c)
set(SHSUMKERNEL ../arm/sum.c) set(SHSUMKERNEL ../arm/sum.c)
set(SHSWAPKERNEL ../arm/swap.c) set(SHSWAPKERNEL ../arm/swap.c)
set(TOBF16KERNEL ../x86_64/tobf16.c)
set(BF16TOKERNEL ../x86_64/bf16to.c)
endif () endif ()
endmacro () endmacro ()

View File

@ -393,6 +393,18 @@ set(REVISION "-r${OpenBLAS_VERSION}")
set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION}) set(MAJOR_VERSION ${OpenBLAS_MAJOR_VERSION})
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${CCOMMON_OPT}")
if (BUILD_SINGLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_SINGLE")
endif()
if (BUILD_DOUBLE)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_DOUBLE")
endif()
if (BUILD_COMPLEX)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX")
endif()
if (BUILD_COMPLEX16)
set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -DBUILD_COMPLEX16")
endif()
if(NOT MSVC) if(NOT MSVC)
set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}") set(CMAKE_ASM_FLAGS "${CMAKE_ASM_FLAGS} ${CCOMMON_OPT}")
endif() endif()

View File

@ -258,7 +258,8 @@ typedef unsigned long BLASULONG;
#endif #endif
#ifndef BFLOAT16 #ifndef BFLOAT16
typedef unsigned short bfloat16; #include <stdint.h>
typedef uint16_t bfloat16;
#define HALFCONVERSION 1 #define HALFCONVERSION 1
#endif #endif

View File

@ -54,6 +54,11 @@ double BLASFUNC(dsdot) (blasint *, float *, blasint *, float *, blasint *);
double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *); double BLASFUNC(ddot) (blasint *, double *, blasint *, double *, blasint *);
xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *); xdouble BLASFUNC(qdot) (blasint *, xdouble *, blasint *, xdouble *, blasint *);
float BLASFUNC(shdot) (blasint *, bfloat16 *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(shstobf16) (blasint *, float *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(shdtobf16) (blasint *, double *, blasint *, bfloat16 *, blasint *);
void BLASFUNC(sbf16tos) (blasint *, bfloat16 *, blasint *, float *, blasint *);
void BLASFUNC(dbf16tod) (blasint *, bfloat16 *, blasint *, double *, blasint *);
#ifdef RETURN_BY_STRUCT #ifdef RETURN_BY_STRUCT
typedef struct { typedef struct {

View File

@ -46,6 +46,12 @@ float sdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG); double dsdot_k(BLASLONG, float *, BLASLONG, float *, BLASLONG);
double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG); double ddot_k(BLASLONG, double *, BLASLONG, double *, BLASLONG);
xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG); xdouble qdot_k(BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG);
float shdot_k(BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
void shstobf16_k(BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void shdtobf16_k(BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
void sbf16tos_k (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
void dbf16tod_k (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG);
openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float cdotc_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);
openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG); openblas_complex_float cdotu_k (BLASLONG, float *, BLASLONG, float *, BLASLONG);

View File

@ -646,6 +646,11 @@
#elif defined(HALF) #elif defined(HALF)
#define D_TO_BF16_K SHDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K
#define S_TO_BF16_K SHSTOBF16_K
#define S_BF16_TO_K SBF16TOS_K
#define AMAX_K SAMAX_K #define AMAX_K SAMAX_K
#define AMIN_K SAMIN_K #define AMIN_K SAMIN_K
#define MAX_K SMAX_K #define MAX_K SMAX_K
@ -657,6 +662,7 @@
#define ASUM_K SASUM_K #define ASUM_K SASUM_K
#define DOTU_K SDOTU_K #define DOTU_K SDOTU_K
#define DOTC_K SDOTC_K #define DOTC_K SDOTC_K
#define BF16_DOT_K SHDOT_K
#define AXPYU_K SAXPYU_K #define AXPYU_K SAXPYU_K
#define AXPYC_K SAXPYC_K #define AXPYC_K SAXPYC_K
#define AXPBY_K SAXPBY_K #define AXPBY_K SAXPBY_K

View File

@ -51,6 +51,11 @@ typedef struct {
int shgemm_p, shgemm_q, shgemm_r; int shgemm_p, shgemm_q, shgemm_r;
int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn; int shgemm_unroll_m, shgemm_unroll_n, shgemm_unroll_mn;
void (*shstobf16_k) (BLASLONG, float *, BLASLONG, bfloat16 *, BLASLONG);
void (*shdtobf16_k) (BLASLONG, double *, BLASLONG, bfloat16 *, BLASLONG);
void (*sbf16tos_k) (BLASLONG, bfloat16 *, BLASLONG, float *, BLASLONG);
void (*dbf16tod_k) (BLASLONG, bfloat16 *, BLASLONG, double *, BLASLONG);
float (*shamax_k) (BLASLONG, float *, BLASLONG); float (*shamax_k) (BLASLONG, float *, BLASLONG);
float (*shamin_k) (BLASLONG, float *, BLASLONG); float (*shamin_k) (BLASLONG, float *, BLASLONG);
float (*shmax_k) (BLASLONG, float *, BLASLONG); float (*shmax_k) (BLASLONG, float *, BLASLONG);
@ -64,7 +69,7 @@ BLASLONG (*ishmin_k) (BLASLONG, float *, BLASLONG);
float (*shasum_k) (BLASLONG, float *, BLASLONG); float (*shasum_k) (BLASLONG, float *, BLASLONG);
float (*shsum_k) (BLASLONG, float *, BLASLONG); float (*shsum_k) (BLASLONG, float *, BLASLONG);
int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*shcopy_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
float (*shdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); float (*shdot_k) (BLASLONG, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG);
double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG); double (*dshdot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float); int (*shrot_k) (BLASLONG, float *, BLASLONG, float *, BLASLONG, float, float);

View File

@ -3,6 +3,12 @@
#ifndef DYNAMIC_ARCH #ifndef DYNAMIC_ARCH
#define SHDOT_K shdot_k
#define SHSTOBF16_K shstobf16_k
#define SHDTOBF16_K shdtobf16_k
#define SBF16TOS_K sbf16tos_k
#define DBF16TOD_K dbf16tod_k
#define SHGEMM_ONCOPY shgemm_oncopy #define SHGEMM_ONCOPY shgemm_oncopy
#define SHGEMM_OTCOPY shgemm_otcopy #define SHGEMM_OTCOPY shgemm_otcopy
@ -18,6 +24,12 @@
#else #else
#define SHDOT_K gotoblas -> shdot_k
#define SHSTOBF16_K gotoblas -> shstobf16_k
#define SHDTOBF16_K gotoblas -> shdtobf16_k
#define SBF16TOS_K gotoblas -> sbf16tos_k
#define DBF16TOD_K gotoblas -> dbf16tod_k
#define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy #define SHGEMM_ONCOPY gotoblas -> shgemm_oncopy
#define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy #define SHGEMM_OTCOPY gotoblas -> shgemm_otcopy
#define SHGEMM_INCOPY gotoblas -> shgemm_incopy #define SHGEMM_INCOPY gotoblas -> shgemm_incopy

View File

@ -59,12 +59,19 @@ extern int blas_omp_linked;
#define BLAS_PTHREAD 0x4000U #define BLAS_PTHREAD 0x4000U
#define BLAS_NODE 0x2000U #define BLAS_NODE 0x2000U
#define BLAS_PREC 0x0003U #define BLAS_PREC 0x000FU
#define BLAS_SINGLE 0x0000U #define BLAS_INT8 0x0000U
#define BLAS_DOUBLE 0x0001U #define BLAS_BFLOAT16 0x0001U
#define BLAS_XDOUBLE 0x0002U #define BLAS_SINGLE 0x0002U
#define BLAS_REAL 0x0000U #define BLAS_DOUBLE 0x0003U
#define BLAS_COMPLEX 0x0004U #define BLAS_XDOUBLE 0x0004U
#define BLAS_STOBF16 0x0008U
#define BLAS_DTOBF16 0x0009U
#define BLAS_BF16TOS 0x000AU
#define BLAS_BF16TOD 0x000BU
#define BLAS_REAL 0x0000U
#define BLAS_COMPLEX 0x1000U
#define BLAS_TRANSA 0x0030U /* 2bit */ #define BLAS_TRANSA 0x0030U /* 2bit */
#define BLAS_TRANSA_N 0x0000U #define BLAS_TRANSA_N 0x0000U

View File

@ -142,6 +142,29 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){
#endif #endif
} }
static __inline void cpuid_count(int op, int count, int *eax, int *ebx, int *ecx, int *edx)
{
#ifdef C_MSVC
int cpuInfo[4] = {-1};
__cpuidex(cpuInfo, op, count);
*eax = cpuInfo[0];
*ebx = cpuInfo[1];
*ecx = cpuInfo[2];
*edx = cpuInfo[3];
#else
#if defined(__i386__) && defined(__PIC__)
__asm__ __volatile__
("mov %%ebx, %%edi;"
"cpuid;"
"xchgl %%ebx, %%edi;"
: "=a" (*eax), "=D" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc");
#else
__asm__ __volatile__
("cpuid": "=a" (*eax), "=b" (*ebx), "=c" (*ecx), "=d" (*edx) : "0" (op), "2" (count) : "cc");
#endif
#endif
}
/* /*
#define WHEREAMI #define WHEREAMI
*/ */

View File

@ -74,16 +74,6 @@ void F77_dswap( const int *N, double *X, const int *incX,
return; return;
} }
double F77_dzasum(const int *N, void *X, const int *incX)
{
return cblas_dzasum(*N, X, *incX);
}
double F77_dznrm2(const int *N, OPENBLAS_CONST void *X, const int *incX)
{
return cblas_dznrm2(*N, X, *incX);
}
int F77_idamax(const int *N, OPENBLAS_CONST double *X, const int *incX) int F77_idamax(const int *N, OPENBLAS_CONST double *X, const int *incX)
{ {
if (*N < 1 || *incX < 1) return(0); if (*N < 1 || *incX < 1) return(0);

View File

@ -21,16 +21,6 @@ void F77_saxpy(blasint *N, const float *alpha, OPENBLAS_CONST float *X,
return; return;
} }
float F77_scasum(blasint *N, float *X, blasint *incX)
{
return cblas_scasum(*N, X, *incX);
}
float F77_scnrm2(blasint *N, OPENBLAS_CONST float *X, blasint *incX)
{
return cblas_scnrm2(*N, X, *incX);
}
void F77_scopy(blasint *N, OPENBLAS_CONST float *X, blasint *incX, void F77_scopy(blasint *N, OPENBLAS_CONST float *X, blasint *incX,
float *Y, blasint *incY) float *Y, blasint *incY)
{ {

View File

@ -139,7 +139,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -209,7 +209,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -374,7 +374,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif

View File

@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -150,7 +150,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -207,7 +207,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -262,7 +262,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -287,7 +287,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif
@ -348,7 +348,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
/* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */ /* the current AVX512 s/d/c/z GEMM kernel requires n>=6*GEMM_UNROLL_N to achieve the best performance */
if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N; if (min_jj >= 6*GEMM_UNROLL_N) min_jj = 6*GEMM_UNROLL_N;
#else #else
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
#endif #endif

View File

@ -131,7 +131,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -197,7 +197,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

View File

@ -126,7 +126,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -182,7 +182,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){ for(jjs = 0; jjs < min_j - min_l - ls + js; jjs += min_jj){
min_jj = min_j - min_l - ls + js - jjs; min_jj = min_j - min_l - ls + js - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -243,7 +243,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = js; jjs < js + min_j; jjs += min_jj){ for(jjs = js; jjs < js + min_j; jjs += min_jj){
min_jj = min_j + js - jjs; min_jj = min_j + js - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;
@ -304,7 +304,7 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, FLO
for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){ for(jjs = 0; jjs < min_j - js + ls; jjs += min_jj){
min_jj = min_j - js + ls - jjs; min_jj = min_j - js + ls - jjs;
if (min_jj > GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3; if (min_jj >= GEMM_UNROLL_N*3) min_jj = GEMM_UNROLL_N*3;
else else
if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N;

View File

@ -49,9 +49,36 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
blas_arg_t args [MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER];
BLASLONG i, width, astride, bstride; BLASLONG i, width, astride, bstride;
int num_cpu, calc_type; int num_cpu, calc_type_a, calc_type_b;
calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; switch (mode & BLAS_PREC) {
case BLAS_INT8 :
case BLAS_BFLOAT16:
case BLAS_SINGLE :
case BLAS_DOUBLE :
case BLAS_XDOUBLE :
calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_STOBF16 :
calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_DTOBF16 :
calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_BF16TOS :
calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_BF16TOD :
calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0);
break;
default:
calc_type_a = calc_type_b = 0;
break;
}
mode |= BLAS_LEGACY; mode |= BLAS_LEGACY;
@ -77,8 +104,8 @@ int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha
bstride = width; bstride = width;
} }
astride <<= calc_type; astride <<= calc_type_a;
bstride <<= calc_type; bstride <<= calc_type_b;
args[num_cpu].m = width; args[num_cpu].m = width;
args[num_cpu].n = n; args[num_cpu].n = n;
@ -120,9 +147,36 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL
blas_arg_t args [MAX_CPU_NUMBER]; blas_arg_t args [MAX_CPU_NUMBER];
BLASLONG i, width, astride, bstride; BLASLONG i, width, astride, bstride;
int num_cpu, calc_type; int num_cpu, calc_type_a, calc_type_b;
calc_type = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0) + 2; switch (mode & BLAS_PREC) {
case BLAS_INT8 :
case BLAS_BFLOAT16:
case BLAS_SINGLE :
case BLAS_DOUBLE :
case BLAS_XDOUBLE :
calc_type_a = calc_type_b = (mode & BLAS_PREC) + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_STOBF16 :
calc_type_a = 2 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_DTOBF16 :
calc_type_a = 3 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 1 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_BF16TOS :
calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 2 + ((mode & BLAS_COMPLEX) != 0);
break;
case BLAS_BF16TOD :
calc_type_a = 1 + ((mode & BLAS_COMPLEX) != 0);
calc_type_b = 3 + ((mode & BLAS_COMPLEX) != 0);
break;
default:
calc_type_a = calc_type_b = 0;
break;
}
mode |= BLAS_LEGACY; mode |= BLAS_LEGACY;
@ -148,8 +202,8 @@ int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASL
bstride = width; bstride = width;
} }
astride <<= calc_type; astride <<= calc_type_a;
bstride <<= calc_type; bstride <<= calc_type_b;
args[num_cpu].m = width; args[num_cpu].m = width;
args[num_cpu].n = n; args[num_cpu].n = n;

View File

@ -192,7 +192,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){ if (!(mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* REAL / Extended Double */ /* REAL / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -205,7 +205,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE){
/* REAL / Double */ /* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -216,21 +216,58 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
/* REAL / Single */ /* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func; float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k, afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0], ((float *)args -> alpha)[0],
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
bfloat16 *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((bfloat16 *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
/* REAL / BLAS_STOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, bfloat16 *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
/* REAL / BLAS_DTOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, bfloat16 *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
#endif
} else {
/* REAL / Other types in future */
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* COMPLEX / Extended Double */ /* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -244,7 +281,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE) {
/* COMPLEX / Double */ /* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -256,7 +293,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
/* COMPLEX / Single */ /* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -268,7 +305,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} } else {
/* COMPLEX / Other types in future */
}
} }
} }
@ -414,33 +453,37 @@ blas_queue_t *tscq;
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){ if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE) {
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} } else {
/* Other types in future */
}
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} } else {
/* Other types in future */
}
} }
queue->sb=sb; queue->sb=sb;
} }

View File

@ -142,7 +142,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){ if (!(mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* REAL / Extended Double */ /* REAL / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -155,7 +155,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE){
/* REAL / Double */ /* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -166,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
/* REAL / Single */ /* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -177,10 +177,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
bfloat16 *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((bfloat16 *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
/* REAL / BLAS_STOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, bfloat16 *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
/* REAL / BLAS_DTOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, bfloat16 *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
#endif
} else {
/* REAL / Other types in future */
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* COMPLEX / Extended Double */ /* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -194,7 +231,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE){
/* COMPLEX / Double */ /* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -206,7 +243,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
/* COMPLEX / Single */ /* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -218,8 +255,10 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} } else {
} /* COMPLEX / Other types in future */
}
}
} }
static void exec_threads(blas_queue_t *queue, int buf_index){ static void exec_threads(blas_queue_t *queue, int buf_index){
@ -255,32 +294,36 @@ static void exec_threads(blas_queue_t *queue, int buf_index){
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){ if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE){
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
/* Other types in future */
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
/* Other types in future */
} }
} }
queue->sb=sb; queue->sb=sb;

View File

@ -77,7 +77,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){ if (!(mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* REAL / Extended Double */ /* REAL / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -90,7 +90,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE){
/* REAL / Double */ /* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -101,7 +101,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE){
/* REAL / Single */ /* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -112,10 +112,47 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
#ifdef BUILD_HALF
} else if ((mode & BLAS_PREC) == BLAS_BFLOAT16){
/* REAL / BFLOAT16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, bfloat16,
bfloat16 *, BLASLONG, bfloat16 *, BLASLONG,
bfloat16 *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((bfloat16 *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_STOBF16){
/* REAL / BLAS_STOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, bfloat16 *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else if ((mode & BLAS_PREC) == BLAS_DTOBF16){
/* REAL / BLAS_DTOBF16 */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, bfloat16 *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
#endif
} else {
/* REAL / Other types in future */
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){ if ((mode & BLAS_PREC) == BLAS_XDOUBLE){
/* COMPLEX / Extended Double */ /* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG,
@ -129,7 +166,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else } else
#endif #endif
if (mode & BLAS_DOUBLE){ if ((mode & BLAS_PREC) == BLAS_DOUBLE){
/* COMPLEX / Double */ /* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG,
@ -141,7 +178,7 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} else { } else if ((mode & BLAS_PREC) == BLAS_SINGLE) {
/* COMPLEX / Single */ /* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float, void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG,
@ -153,7 +190,9 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
args -> a, args -> lda, args -> a, args -> lda,
args -> b, args -> ldb, args -> b, args -> ldb,
args -> c, args -> ldc, sb); args -> c, args -> ldc, sb);
} } else {
/* COMPLEX / Other types in future */
}
} }
} }
@ -233,32 +272,36 @@ static DWORD WINAPI blas_thread_server(void *arg){
if (sb == NULL) { if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){ if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double) sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float) sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
/* Other types in future */
} }
} else { } else {
#ifdef EXPRECISION #ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble) sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else } else
#endif #endif
if (queue -> mode & BLAS_DOUBLE){ if ((queue -> mode & BLAS_PREC) == BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double) sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else { } else if ((queue -> mode & BLAS_PREC) == BLAS_SINGLE) {
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float) sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
/* Other types in future */
} }
} }
queue->sb=sb; queue->sb=sb;

View File

@ -207,6 +207,19 @@ extern gotoblas_t gotoblas_SKYLAKEX;
#else #else
#define gotoblas_SKYLAKEX gotoblas_PRESCOTT #define gotoblas_SKYLAKEX gotoblas_PRESCOTT
#endif #endif
#ifdef DYN_COOPERLAKE
extern gotoblas_t gotoblas_COOPERLAKE;
#elif defined(DYN_SKYLAKEX)
#define gotoblas_COOPERLAKE gotoblas_SKYLAKEX
#elif defined(DYN_HASWELL)
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#elif defined(DYN_SANDYBRIDGE)
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#elif defined(DYN_NEHALEM)
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#else
#define gotoblas_COOPERLAKE gotoblas_PRESCOTT
#endif
#else // not DYNAMIC_LIST #else // not DYNAMIC_LIST
@ -247,14 +260,17 @@ extern gotoblas_t gotoblas_EXCAVATOR;
#ifdef NO_AVX2 #ifdef NO_AVX2
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE #define gotoblas_HASWELL gotoblas_SANDYBRIDGE
#define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
#define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
#define gotoblas_ZEN gotoblas_SANDYBRIDGE #define gotoblas_ZEN gotoblas_SANDYBRIDGE
#else #else
extern gotoblas_t gotoblas_HASWELL; extern gotoblas_t gotoblas_HASWELL;
extern gotoblas_t gotoblas_ZEN; extern gotoblas_t gotoblas_ZEN;
#ifndef NO_AVX512 #ifndef NO_AVX512
extern gotoblas_t gotoblas_SKYLAKEX; extern gotoblas_t gotoblas_SKYLAKEX;
extern gotoblas_t gotoblas_COOPERLAKE;
#else #else
#define gotoblas_SKYLAKEX gotoblas_HASWELL #define gotoblas_SKYLAKEX gotoblas_HASWELL
#define gotoblas_COOPERLAKE gotoblas_HASWELL
#endif #endif
#endif #endif
#else #else
@ -262,6 +278,7 @@ extern gotoblas_t gotoblas_SKYLAKEX;
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_HASWELL gotoblas_NEHALEM
#define gotoblas_SKYLAKEX gotoblas_NEHALEM #define gotoblas_SKYLAKEX gotoblas_NEHALEM
#define gotoblas_COOPERLAKE gotoblas_NEHALEM
#define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA
#define gotoblas_STEAMROLLER gotoblas_BARCELONA #define gotoblas_STEAMROLLER gotoblas_BARCELONA
@ -343,6 +360,23 @@ int support_avx512(){
#endif #endif
} }
int support_avx512_bf16(){
#if !defined(NO_AVX) && !defined(NO_AVX512)
int eax, ebx, ecx, edx;
int ret=0;
if (!support_avx512())
return 0;
cpuid_count(7, 1, &eax, &ebx, &ecx, &edx);
if((eax & 32) == 32){
ret=1; // CPUID.7.1:EAX[bit 5] indicates whether avx512_bf16 supported or not
}
return ret;
#else
return 0;
#endif
}
extern void openblas_warning(int verbose, const char * msg); extern void openblas_warning(int verbose, const char * msg);
#define FALLBACK_VERBOSE 1 #define FALLBACK_VERBOSE 1
#define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n" #define NEHALEM_FALLBACK "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"
@ -524,7 +558,10 @@ static gotoblas_t *get_coretype(void){
return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels.
} }
} }
if (model == 5) { if (model == 5) {
// Intel Cooperlake
if(support_avx512_bf16())
return &gotoblas_COOPERLAKE;
// Intel Skylake X // Intel Skylake X
if (support_avx512()) if (support_avx512())
return &gotoblas_SKYLAKEX; return &gotoblas_SKYLAKEX;
@ -774,7 +811,8 @@ static char *corename[] = {
"Steamroller", "Steamroller",
"Excavator", "Excavator",
"Zen", "Zen",
"SkylakeX" "SkylakeX",
"Cooperlake"
}; };
char *gotoblas_corename(void) { char *gotoblas_corename(void) {
@ -838,6 +876,7 @@ char *gotoblas_corename(void) {
if (gotoblas == &gotoblas_EXCAVATOR) return corename[22]; if (gotoblas == &gotoblas_EXCAVATOR) return corename[22];
if (gotoblas == &gotoblas_ZEN) return corename[23]; if (gotoblas == &gotoblas_ZEN) return corename[23];
if (gotoblas == &gotoblas_SKYLAKEX) return corename[24]; if (gotoblas == &gotoblas_SKYLAKEX) return corename[24];
if (gotoblas == &gotoblas_COOPERLAKE) return corename[25];
return corename[0]; return corename[0];
} }
@ -868,6 +907,7 @@ static gotoblas_t *force_coretype(char *coretype){
switch (found) switch (found)
{ {
case 25: return (&gotoblas_COOPERLAKE);
case 24: return (&gotoblas_SKYLAKEX); case 24: return (&gotoblas_SKYLAKEX);
case 23: return (&gotoblas_ZEN); case 23: return (&gotoblas_ZEN);
case 22: return (&gotoblas_EXCAVATOR); case 22: return (&gotoblas_EXCAVATOR);

View File

@ -46,7 +46,7 @@
ssum, dsum, scsum, dzsum ssum, dsum, scsum, dzsum
); );
@halfblasobjs = (shgemm); @halfblasobjs = (shgemm, shdot, shstobf16, shdtobf16, sbf16tos, dbf16tod);
@cblasobjs = ( @cblasobjs = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@ -84,7 +84,7 @@
cblas_xerbla cblas_xerbla
); );
@halfcblasobjs = (cblas_shgemm); @halfcblasobjs = (cblas_shgemm, cblas_shdot, cblas_shstobf16, cblas_shdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@exblasobjs = ( @exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,

View File

@ -69,7 +69,7 @@ if ($compiler eq "") {
$bu = "_"; $bu = "_";
} }
if ($data =~ /GNU/) { if ($data =~ /GNU/ || $data =~ /GCC/ ) {
$data =~ /(\d+)\.(\d+).(\d+)/; $data =~ /(\d+)\.(\d+).(\d+)/;
$major = $1; $major = $1;

View File

@ -47,7 +47,9 @@ SBLAS3OBJS = \
sgeadd.$(SUFFIX) sgeadd.$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_HALF),1)
SHBLAS1OBJS = shdot.$(SUFFIX)
SHBLAS3OBJS = shgemm.$(SUFFIX) SHBLAS3OBJS = shgemm.$(SUFFIX)
SHEXTOBJS = shstobf16.$(SUFFIX) shdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif endif
DBLAS1OBJS = \ DBLAS1OBJS = \
@ -281,7 +283,9 @@ CSBLAS3OBJS = \
cblas_sgeadd.$(SUFFIX) cblas_sgeadd.$(SUFFIX)
ifeq ($(BUILD_HALF),1) ifeq ($(BUILD_HALF),1)
CSHBLAS1OBJS = cblas_shdot.$(SUFFIX)
CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX) CSHBLAS3OBJS = cblas_shgemm.$(SUFFIX)
CSHEXTOBJS = cblas_shstobf16.$(SUFFIX) cblas_shdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif endif
CDBLAS1OBJS = \ CDBLAS1OBJS = \
@ -374,6 +378,7 @@ override CFLAGS += -I.
SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS2OBJS += $(CSBLAS2OBJS)
SBLAS3OBJS += $(CSBLAS3OBJS) SBLAS3OBJS += $(CSBLAS3OBJS)
SHBLAS1OBJS += $(CSHBLAS1OBJS)
SHBLAS3OBJS += $(CSHBLAS3OBJS) SHBLAS3OBJS += $(CSHBLAS3OBJS)
DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS1OBJS += $(CDBLAS1OBJS)
DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS2OBJS += $(CDBLAS2OBJS)
@ -385,10 +390,11 @@ ZBLAS1OBJS += $(CZBLAS1OBJS)
ZBLAS2OBJS += $(CZBLAS2OBJS) ZBLAS2OBJS += $(CZBLAS2OBJS)
ZBLAS3OBJS += $(CZBLAS3OBJS) ZBLAS3OBJS += $(CZBLAS3OBJS)
SHEXTOBJS += $(CSHEXTOBJS)
endif endif
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
SHBLASOBJS = $(SHBLAS3OBJS) SHBLASOBJS = $(SHBLAS1OBJS) $(SHBLAS3OBJS)
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@ -463,7 +469,7 @@ ZBLASOBJS += $(ZLAPACKOBJS)
endif endif
FUNCOBJS = $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) FUNCOBJS = $(SHEXTOBJS) $(SHBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
ifdef EXPRECISION ifdef EXPRECISION
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
@ -491,7 +497,7 @@ endif
clean :: clean ::
@rm -f functable.h @rm -f functable.h
level1 : $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) level1 : $(BEXTOBJS) $(SHBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
@ -725,6 +731,19 @@ sdsdot.$(SUFFIX) sdsdot.$(PSUFFIX) : sdsdot.c
dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c dsdot.$(SUFFIX) dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
ifeq ($(BUILD_HALF),1)
shdot.$(SUFFIX) shdot.$(PSUFFIX) : bf16dot.c
$(CC) $(CFLAGS) -c $< -o $(@F)
shstobf16.$(SUFFIX) shstobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
shdtobf16.$(SUFFIX) shdtobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F)
sbf16tos.$(SUFFIX) sbf16tos.$(PSUFFIX) : bf16to.c
$(CC) $(CFLAGS) -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
dbf16tod.$(SUFFIX) dbf16tod.$(PSUFFIX) : bf16to.c
$(CC) $(CFLAGS) -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F)
endif
sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c sdot.$(SUFFIX) sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -c $< -o $(@F) $(CC) $(CFLAGS) -c $< -o $(@F)
@ -1463,6 +1482,19 @@ cblas_sdsdot.$(SUFFIX) cblas_sdsdot.$(PSUFFIX) : sdsdot.c
cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c cblas_dsdot.$(SUFFIX) cblas_dsdot.$(PSUFFIX) : dsdot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
ifeq ($(BUILD_HALF),1)
cblas_shdot.$(SUFFIX) cblas_shdot.$(PSUFFIX) : bf16dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)
cblas_shstobf16.$(SUFFIX) cblas_shstobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
cblas_shdtobf16.$(SUFFIX) cblas_shdtobf16.$(PSUFFIX) : tobf16.c
$(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F)
cblas_sbf16tos.$(SUFFIX) cblas_sbf16tos.$(PSUFFIX) : bf16to.c
$(CC) $(CFLAGS) -DCBLAS -DSINGLE_PREC -UDOUBLE_PREC -c $< -o $(@F)
cblas_dbf16tod.$(SUFFIX) cblas_dbf16tod.$(PSUFFIX) : bf16to.c
$(CC) $(CFLAGS) -DCBLAS -USINGLE_PREC -DDOUBLE_PREC -c $< -o $(@F)
endif
cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c cblas_sdot.$(SUFFIX) cblas_sdot.$(PSUFFIX) : dot.c
$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

52
interface/bf16dot.c Normal file
View File

@ -0,0 +1,52 @@
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#ifndef CBLAS
float NAME(blasint *N, bfloat16 *x, blasint *INCX, bfloat16 *y, blasint *INCY){
BLASLONG n = *N;
BLASLONG incx = *INCX;
BLASLONG incy = *INCY;
float ret;
PRINT_DEBUG_NAME;
if (n <= 0) return 0.;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
ret = BF16_DOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
return ret;
}
#else
float CNAME(blasint n, bfloat16 *x, blasint incx, bfloat16 *y, blasint incy){
float ret;
PRINT_DEBUG_CNAME;
if (n <= 0) return 0.;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (incx < 0) x -= (n - 1) * incx;
if (incy < 0) y -= (n - 1) * incy;
ret = BF16_DOT_K(n, x, incx, y, incy);
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
return ret;
}
#endif

62
interface/bf16to.c Normal file
View File

@ -0,0 +1,62 @@
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(DOUBLE_PREC)
#define FLOAT_TYPE double
#elif defined(SINGLE_PREC)
#define FLOAT_TYPE float
#else
#endif
#ifndef CBLAS
void NAME(blasint *N, bfloat16 *in, blasint *INC_IN, FLOAT_TYPE *out, blasint *INC_OUT){
BLASLONG n = *N;
BLASLONG inc_in = *INC_IN;
BLASLONG inc_out = *INC_OUT;
PRINT_DEBUG_NAME;
if (n <= 0) return;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (inc_in < 0) in -= (n - 1) * inc_in;
if (inc_out < 0) out -= (n - 1) * inc_out;
#if defined(DOUBLE_PREC)
D_BF16_TO_K(n, in, inc_in, out, inc_out);
#elif defined(SINGLE_PREC)
S_BF16_TO_K(n, in, inc_in, out, inc_out);
#else
#endif
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
}
#else
void CNAME(blasint n, bfloat16 * in, blasint inc_in, FLOAT_TYPE * out, blasint inc_out){
PRINT_DEBUG_CNAME;
if (n <= 0) return;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (inc_in < 0) in -= (n - 1) * inc_in;
if (inc_out < 0) out -= (n - 1) * inc_out;
#if defined(DOUBLE_PREC)
D_BF16_TO_K(n, in, inc_in, out, inc_out);
#elif defined(SINGLE_PREC)
S_BF16_TO_K(n, in, inc_in, out, inc_out);
#else
#endif
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
}
#endif

61
interface/tobf16.c Normal file
View File

@ -0,0 +1,61 @@
#include <stdio.h>
#include "common.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#if defined(DOUBLE_PREC)
#define FLOAT_TYPE double
#elif defined(SINGLE_PREC)
#define FLOAT_TYPE float
#else
#endif
#ifndef CBLAS
void NAME(blasint *N, FLOAT_TYPE *in, blasint *INC_IN, bfloat16 *out, blasint *INC_OUT){
BLASLONG n = *N;
BLASLONG inc_in = *INC_IN;
BLASLONG inc_out = *INC_OUT;
PRINT_DEBUG_NAME;
if (n <= 0) return;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (inc_in < 0) in -= (n - 1) * inc_in;
if (inc_out < 0) out -= (n - 1) * inc_out;
#if defined(DOUBLE_PREC)
D_TO_BF16_K(n, in, inc_in, out, inc_out);
#elif defined(SINGLE_PREC)
S_TO_BF16_K(n, in, inc_in, out, inc_out);
#else
#endif
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
}
#else
void CNAME(blasint n, FLOAT_TYPE *in, blasint inc_in, bfloat16 *out, blasint inc_out){
PRINT_DEBUG_CNAME;
if (n <= 0) return;
IDEBUG_START;
FUNCTION_PROFILE_START();
if (inc_in < 0) in -= (n - 1) * inc_in;
if (inc_out < 0) out -= (n - 1) * inc_out;
#if defined(DOUBLE_PREC)
D_TO_BF16_K(n, in, inc_in, out, inc_out);
#elif defined(SINGLE_PREC)
S_TO_BF16_K(n, in, inc_in, out, inc_out);
#endif
FUNCTION_PROFILE_END(1, 2 * n, 2 * n);
IDEBUG_END;
}
#endif

View File

@ -262,6 +262,20 @@ ifndef XDOTKERNEL
XDOTKERNEL = zdot.S XDOTKERNEL = zdot.S
endif endif
ifeq ($(BUILD_HALF),1)
ifndef SHDOTKERNEL
SHDOTKERNEL = ../x86_64/shdot.c
endif
ifndef TOBF16KERNEL
TOBF16KERNEL = ../x86_64/tobf16.c
endif
ifndef BF16TOKERNEL
BF16TOKERNEL = ../x86_64/bf16to.c
endif
endif
### NRM2 ### ### NRM2 ###
ifndef SNRM2KERNEL ifndef SNRM2KERNEL
@ -516,6 +530,15 @@ XBLASOBJS += \
xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \ xdotc_k$(TSUFFIX).$(SUFFIX) xdotu_k$(TSUFFIX).$(SUFFIX) xnrm2_k$(TSUFFIX).$(SUFFIX) xqrot_k$(TSUFFIX).$(SUFFIX) \
xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX) xscal_k$(TSUFFIX).$(SUFFIX) xswap_k$(TSUFFIX).$(SUFFIX) xsum_k$(TSUFFIX).$(SUFFIX)
ifeq ($(BUILD_HALF),1)
SHBLASOBJS += \
shdot_k$(TSUFFIX).$(SUFFIX)
SHEXTOBJS += \
shstobf16_k$(TSUFFIX).$(SUFFIX) shdtobf16_k$(TSUFFIX).$(SUFFIX)
SHEXTOBJS += \
sbf16tos_k$(TSUFFIX).$(SUFFIX) dbf16tod_k$(TSUFFIX).$(SUFFIX)
endif
### AMAX ### ### AMAX ###
@ -734,6 +757,19 @@ $(KDIR)ddot_k$(TSUFFIX).$(SUFFIX) $(KDIR)ddot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNEL
$(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL) $(KDIR)qdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)qdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(QDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -DXDOUBLE $< -o $@
ifeq ($(BUILD_HALF),1)
$(KDIR)shdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)shdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SHDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
$(KDIR)shstobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
$(KDIR)shdtobf16_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(TOBF16KERNEL)
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
$(KDIR)sbf16tos_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
$(CC) -c $(CFLAGS) -UDOUBLE -DSINGLE $< -o $@
$(KDIR)dbf16tod_k$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(BF16TOKERNEL)
$(CC) -c $(CFLAGS) -DDOUBLE -USINGLE $< -o $@
endif
$(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL) $(KDIR)sdot_k$(TSUFFIX).$(SUFFIX) $(KDIR)sdot_k$(TPSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SDOTKERNEL)
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@ $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE $< -o $@

View File

@ -62,9 +62,11 @@ gotoblas_t TABLE_NAME = {
MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N), MAX(SHGEMM_DEFAULT_UNROLL_M, SHGEMM_DEFAULT_UNROLL_N),
#endif #endif
shstobf16_kTS, shdtobf16_kTS, sbf16tos_kTS, dbf16tod_kTS,
samax_kTS, samin_kTS, smax_kTS, smin_kTS, samax_kTS, samin_kTS, smax_kTS, smin_kTS,
isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS, isamax_kTS, isamin_kTS, ismax_kTS, ismin_kTS,
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sdot_kTS, snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, shdot_kTS,
dsdot_kTS, dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
sgemv_nTS, sgemv_tTS, sger_kTS, sgemv_nTS, sgemv_tTS, sger_kTS,

View File

@ -146,6 +146,18 @@ ifndef XDOTKERNEL
XDOTKERNEL = zdot.S XDOTKERNEL = zdot.S
endif endif
ifndef SHDOTKERNEL
SHDOTKERNEL = shdot.c
endif
ifndef TOBF16KERNEL
TOBF16KERNEL = tobf16.c
endif
ifndef BF16TOKERNEL
BF16TOKERNEL = bf16to.c
endif
ifndef ISAMAXKERNEL ifndef ISAMAXKERNEL
ISAMAXKERNEL = iamax_sse.S ISAMAXKERNEL = iamax_sse.S
endif endif

114
kernel/x86_64/bf16to.c Normal file
View File

@ -0,0 +1,114 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stddef.h>
#include "common.h"
#if defined(DOUBLE)
#define FLOAT_TYPE double
#elif defined(SINGLE)
#define FLOAT_TYPE float
#else
#endif
/* Notes for algorithm:
* - Input denormal treated as zero
* - Force to be QNAN
*/
static void bf16to_kernel_1(BLASLONG n, const bfloat16 * in, BLASLONG inc_in, FLOAT_TYPE * out, BLASLONG inc_out)
{
BLASLONG register index_in = 0;
BLASLONG register index_out = 0;
BLASLONG register index = 0;
uint16_t * tmp = NULL;
#if defined(DOUBLE)
float float_out = 0.0;
#endif
while(index<n) {
#if defined(DOUBLE)
float_out = 0.0;
tmp = (uint16_t*)(&float_out);
#else
*(out+index_out) = 0;
tmp = (uint16_t*)(out+index_out);
#endif
switch((*(in+index_in)) & 0xff80u) {
case (0x0000u): /* Type 1: Positive denormal */
tmp[1] = 0x0000u;
tmp[0] = 0x0000u;
break;
case (0x8000u): /* Type 2: Negative denormal */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
tmp[1] = 0x8000u;
tmp[0] = 0x0000u;
#else
tmp[1] = 0x0000u;
tmp[0] = 0x8000u;
#endif
break;
case (0x7f80u): /* Type 3: Positive infinity or NAN */
case (0xff80u): /* Type 4: Negative infinity or NAN */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
tmp[1] = *(in+index_in);
#else
tmp[0] = *(in+index_in);
#endif
/* Specific for NAN */
if (((*(in+index_in)) & 0x007fu) != 0) {
/* Force to be QNAN */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
tmp[1] |= 0x0040u;
#else
tmp[0] |= 0x0040u;
#endif
}
break;
default: /* Type 5: Normal case */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
tmp[1] = *(in+index_in);
#else
tmp[0] = *(in+index_in);
#endif
break;
}
#if defined(DOUBLE)
*(out+index_out) = (double)float_out;
#endif
index_in += inc_in;
index_out += inc_out;
index++;
}
}
void CNAME(BLASLONG n, bfloat16 * in, BLASLONG inc_in, FLOAT_TYPE * out, BLASLONG inc_out)
{
if (n <= 0) return;
bf16to_kernel_1(n, in, inc_in, out, inc_out);
}

View File

@ -0,0 +1,104 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_TOBF16_ACCL_KERNEL 1
#include "common.h"
#include <immintrin.h>
static void tobf16_accl_kernel(BLASLONG n, const double * in, bfloat16 * out)
{
/* Get the 64-bytes unaligned header number targeting for avx512
* processing (Assume input float array is natural aligned) */
int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 3) & 0x7;
if (n < align_header) {align_header = n;}
if (align_header != 0) {
unsigned char align_mask8 = (((unsigned char)0xff) >> (8-align_header));
__m512d a = _mm512_maskz_loadu_pd(*((__mmask8*) &align_mask8), &in[0]);
_mm_mask_storeu_epi16(&out[0], *((__mmask8*) &align_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(a)));
}
if (n == align_header) {
return;
} else {
n -= align_header;
in += align_header;
out += align_header;
}
int tail_index_8 = n&(~7);
int tail_index_32 = n&(~31);
int tail_index_128 = n&(~127);
unsigned char tail_mask8 = (((unsigned char) 0xff) >> (8 -(n&7)));
/* Processing the main chunk with 128-elements per round */
for (int i = 0; i < tail_index_128; i += 128) {
// Fold 1
__m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+ 8])), 1);
__m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+24])), 1);
_mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low));
// Fold 2
__m512 data2_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+32]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+40])), 1);
__m512 data2_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+48]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+56])), 1);
_mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(data2_512_high, data2_512_low));
// Fold 3
__m512 data3_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+64]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+72])), 1);
__m512 data3_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+80]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+88])), 1);
_mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(data3_512_high, data3_512_low));
// Fold 4
__m512 data4_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+96]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+104])), 1);
__m512 data4_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[i+112]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[i+120])), 1);
_mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(data4_512_high, data4_512_low));
}
/* Processing the remaining <128 chunk with 32-elements per round */
for (int j = tail_index_128; j < tail_index_32; j += 32) {
__m512 data1_512_low = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 0]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+ 8])), 1);
__m512 data1_512_high = _mm512_insertf32x8(_mm512_castps256_ps512(_mm512_cvtpd_ps(_mm512_load_pd(&in[j+16]))), _mm512_cvtpd_ps(_mm512_load_pd(&in[j+24])), 1);
_mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(data1_512_high, data1_512_low));
}
/* Processing the remaining <32 chunk with 8-elements per round */
for (int j = tail_index_32; j < tail_index_8; j += 8) {
_mm_storeu_si128((__m128i *)&out[j], (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(_mm512_load_pd(&in[j]))));
}
/* Processing the remaining <8 chunk with masked processing */
if ((n&7) > 0) {
__m512d data_512 = _mm512_maskz_load_pd(*((__mmask8*) &tail_mask8), &in[tail_index_8]);
_mm_mask_storeu_epi16(&out[tail_index_8], *((__mmask8*) &tail_mask8), (__m128i) _mm256_cvtneps_pbh(_mm512_cvtpd_ps(data_512)));
}
}
#endif

115
kernel/x86_64/shdot.c Normal file
View File

@ -0,0 +1,115 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(COOPERLAKE)
#include "shdot_microk_cooperlake.c"
#endif
static float shdot_compute(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
{
float d = 0.0;
#ifdef HAVE_SHDOT_ACCL_KERNEL
if ((inc_x == 1) && (inc_y == 1)) {
return shdot_accl_kernel(n, x, y);
}
#endif
float * x_fp32 = malloc(sizeof(float)*n);
float * y_fp32 = malloc(sizeof(float)*n);
SBF16TOS_K(n, x, inc_x, x_fp32, 1);
SBF16TOS_K(n, y, inc_y, y_fp32, 1);
d = SDOTU_K(n, x_fp32, 1, y_fp32, 1);
free(x_fp32);
free(y_fp32);
return d;
}
#if defined(SMP)
static int shdot_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, bfloat16 dummy2,
bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y,
float *result, BLASLONG dummy3)
{
*(float *)result = shdot_compute(n, x, inc_x, y, inc_y);
return 0;
}
extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc,
int (*function)(), int nthreads);
#endif
float CNAME(BLASLONG n, bfloat16 *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y)
{
float dot_result = 0.0;
if (n <= 0) return 0.0;
#if defined(SMP)
int nthreads;
int thread_thres = 40960;
bfloat16 dummy_alpha;
#endif
#if defined(SMP)
if (inc_x == 0 || inc_y == 0 || n <= thread_thres)
nthreads = 1;
else
nthreads = num_cpu_avail(1);
int best_threads = (int) (n/(float)thread_thres + 0.5);
if (best_threads < nthreads) {
nthreads = best_threads;
}
if (nthreads <= 1) {
dot_result = shdot_compute(n, x, inc_x, y, inc_y);
} else {
char thread_result[MAX_CPU_NUMBER * sizeof(double) * 2];
int mode = BLAS_BFLOAT16 | BLAS_REAL;
blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha,
x, inc_x, y, inc_y, thread_result, 0,
(void *)shdot_thread_func, nthreads);
float * ptr = (float *)thread_result;
for (int i = 0; i < nthreads; i++) {
dot_result += (*ptr);
ptr = (float *)(((char *)ptr) + sizeof(double) * 2);
}
}
#else
dot_result = shdot_compute(n, x, inc_x, y, inc_y);
#endif
return dot_result;
}

View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_SHDOT_ACCL_KERNEL 1
#include "common.h"
#include <immintrin.h>
static float shdot_accl_kernel(BLASLONG n, bfloat16 *x, bfloat16 *y)
{
__m128 accum128 = _mm_setzero_ps();
if (n> 127) { /* n range from 128 to inf. */
long tail_index_32 = n&(~31);
long tail_index_128 = n&(~127);
unsigned int tail_mask_uint = (((unsigned int)0xffffffff) >> (32-(n&31)));
__mmask32 tail_mask = *((__mmask32*) &tail_mask_uint);
__m512 accum512_0 = _mm512_setzero_ps();
__m512 accum512_1 = _mm512_setzero_ps();
__m512 accum512_2 = _mm512_setzero_ps();
__m512 accum512_3 = _mm512_setzero_ps();
/* Processing the main chunk with 128-elements per round */
for (long i = 0; i < tail_index_128; i += 128) {
accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[i+ 0]), (__m512bh) _mm512_loadu_si512(&y[i+ 0]));
accum512_1 = _mm512_dpbf16_ps(accum512_1, (__m512bh) _mm512_loadu_si512(&x[i+32]), (__m512bh) _mm512_loadu_si512(&y[i+32]));
accum512_2 = _mm512_dpbf16_ps(accum512_2, (__m512bh) _mm512_loadu_si512(&x[i+64]), (__m512bh) _mm512_loadu_si512(&y[i+64]));
accum512_3 = _mm512_dpbf16_ps(accum512_3, (__m512bh) _mm512_loadu_si512(&x[i+96]), (__m512bh) _mm512_loadu_si512(&y[i+96]));
}
/* Processing the remaining <128 chunk with 32-elements per round */
for (long j = tail_index_128; j < tail_index_32; j += 32) {
accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) _mm512_loadu_si512(&x[j]), (__m512bh) _mm512_loadu_si512(&y[j]));
}
/* Processing the remaining <32 chunk with masked 32-elements processing */
if ((n&31) != 0) {
accum512_2 = _mm512_dpbf16_ps(accum512_2,
(__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &x[tail_index_32]),
(__m512bh) _mm512_maskz_loadu_epi16(tail_mask, &y[tail_index_32]));
}
/* Accumulate the 4 registers into 1 register */
accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
accum512_2 = _mm512_add_ps(accum512_2, accum512_3);
accum512_0 = _mm512_add_ps(accum512_0, accum512_2);
__m256 accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
} else if (n > 31) { /* n range from 32 to 127 */
/* Processing <128 chunk with 32-elements per round */
__m256 accum256 = _mm256_setzero_ps();
__m256 accum256_1 = _mm256_setzero_ps();
int tail_index_32 = n&(~31);
for (int j = 0; j < tail_index_32; j += 32) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[j+ 0]), (__m256bh) _mm256_loadu_si256(&y[j+ 0]));
accum256_1 = _mm256_dpbf16_ps(accum256_1, (__m256bh) _mm256_loadu_si256(&x[j+16]), (__m256bh) _mm256_loadu_si256(&y[j+16]));
}
accum256 = _mm256_add_ps(accum256, accum256_1);
/* Processing the remaining <32 chunk with 16-elements processing */
if ((n&16) != 0) {
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[tail_index_32]), (__m256bh) _mm256_loadu_si256(&y[tail_index_32]));
}
accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
}
/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {
unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7)));
__mmask8 tail_mask = *((__mmask8*) &tail_mask_uint);
int tail_index_8 = n&(~7);
accum128 = _mm_dpbf16_ps(accum128,
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]),
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8]));
}
} else if (n > 15) { /* n range from 16 to 31 */
/* Processing <32 chunk with 16-elements processing */
__m256 accum256 = _mm256_setzero_ps();
accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) _mm256_loadu_si256(&x[0]), (__m256bh) _mm256_loadu_si256(&y[0]));
accum128 += _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf128_ps(accum256, 1));
/* Processing the remaining <16 chunk with 8-elements processing */
if ((n&8) != 0) {
int tail_index_16 = n&(~15);
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[tail_index_16]), (__m128bh) _mm_loadu_si128(&y[tail_index_16]));
}
/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {
unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7)));
__mmask8 tail_mask = *((__mmask8*) &tail_mask_uint);
int tail_index_8 = n&(~7);
accum128 = _mm_dpbf16_ps(accum128,
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]),
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8]));
}
} else if (n > 7) { /* n range from 8 to 15 */
/* Processing <16 chunk with 8-elements processing */
accum128 = _mm_dpbf16_ps(accum128, (__m128bh) _mm_loadu_si128(&x[0]), (__m128bh) _mm_loadu_si128(&y[0]));
/* Processing the remaining <8 chunk with masked 8-elements processing */
if ((n&7) != 0) {
unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7)));
__mmask8 tail_mask = *((__mmask8*) &tail_mask_uint);
int tail_index_8 = n&(~7);
accum128 = _mm_dpbf16_ps(accum128,
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[tail_index_8]),
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[tail_index_8]));
}
} else { /* n range from 1 to 7 */
unsigned char tail_mask_uint = (((unsigned char)0xff) >> (8-(n&7)));
__mmask8 tail_mask = *((__mmask8*) &tail_mask_uint);
accum128 = _mm_dpbf16_ps(accum128,
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &x[0]),
(__m128bh) _mm_maskz_loadu_epi16(tail_mask, &y[0]));
}
/* Add up the 4 elements into lowest entry */
__m128 accum128_1 = _mm_shuffle_ps(accum128, accum128, 14);
accum128 = _mm_add_ps(accum128, accum128_1);
accum128_1 = _mm_shuffle_ps(accum128, accum128, 1);
accum128 = _mm_add_ps(accum128, accum128_1);
return accum128[0];
}
#endif

View File

@ -0,0 +1,86 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* need a new enough GCC for avx512 support */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
#define HAVE_TOBF16_ACCL_KERNEL 1
#include "common.h"
#include <immintrin.h>
static void tobf16_accl_kernel(BLASLONG n, const float * in, bfloat16 * out)
{
/* Get the 64-bytes unaligned header number targeting for avx512
* processing (Assume input float array is natural aligned) */
int align_header = ((64 - ((uintptr_t)in & (uintptr_t)0x3f)) >> 2) & 0xf;
if (n < align_header) {align_header = n;}
if (align_header != 0) {
uint16_t align_mask16 = (((uint16_t)0xffff) >> (16-align_header));
__m512 a = _mm512_maskz_loadu_ps(*((__mmask16*) &align_mask16), &in[0]);
_mm256_mask_storeu_epi16(&out[0], *((__mmask16*) &align_mask16), (__m256i) _mm512_cvtneps_pbh(a));
}
if (n == align_header) {
return;
} else {
n -= align_header;
in += align_header;
out += align_header;
}
int tail_index_32 = n&(~31);
int tail_index_128 = n&(~127);
uint32_t tail_mask32 = (((uint32_t) 0xffffffff) >> (32-(n&31)));
uint16_t tail_mask16 = (((uint16_t) 0xffff) >> (16-(n&15)));
/* Processing the main chunk with 128-elements per round */
for (int i = 0; i < tail_index_128; i += 128) {
_mm512_storeu_si512(&out[i+ 0], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 16]), _mm512_load_ps(&in[i+ 0])));
_mm512_storeu_si512(&out[i+32], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 48]), _mm512_load_ps(&in[i+32])));
_mm512_storeu_si512(&out[i+64], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+ 80]), _mm512_load_ps(&in[i+64])));
_mm512_storeu_si512(&out[i+96], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[i+112]), _mm512_load_ps(&in[i+96])));
}
/* Processing the remaining <128 chunk with 32-elements per round */
for (int j = tail_index_128; j < tail_index_32; j += 32) {
_mm512_storeu_si512(&out[j], (__m512i) _mm512_cvtne2ps_pbh(_mm512_load_ps(&in[j+ 16]), _mm512_load_ps(&in[j])));
}
/* Processing the remaining <32 chunk with masked processing */
if ((n&31) > 15) {
__m512 b = _mm512_load_ps(&in[tail_index_32]);
__m512 a = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &in[tail_index_32+16]);
_mm512_mask_storeu_epi16(&out[tail_index_32], *((__mmask32*) &tail_mask32), (__m512i) _mm512_cvtne2ps_pbh(a, b));
} else if ((n&31) > 0) {
__m512 a = _mm512_maskz_load_ps(*((__mmask16*) &tail_mask16), &in[tail_index_32]);
_mm256_mask_storeu_epi16(&out[tail_index_32], *((__mmask16*) &tail_mask16), (__m256i) _mm512_cvtneps_pbh(a));
}
}
#endif

170
kernel/x86_64/tobf16.c Normal file
View File

@ -0,0 +1,170 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <stddef.h>
#include "common.h"
#if defined(DOUBLE)
#define FLOAT_TYPE double
#elif defined(SINGLE)
#define FLOAT_TYPE float
#else
#endif
#if defined(COOPERLAKE)
#if defined(DOUBLE)
#include "dtobf16_microk_cooperlake.c"
#elif defined(SINGLE)
#include "stobf16_microk_cooperlake.c"
#endif
#endif
/* Notes for algorithm:
* - Round to Nearest Even used generally
* - QNAN for NAN case
* - Input denormals are treated as zero
*/
static void tobf16_generic_kernel(BLASLONG n, const FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out)
{
BLASLONG register index_in = 0;
BLASLONG register index_out = 0;
BLASLONG register index = 0;
float float_in = 0.0;
uint32_t * uint32_in = (uint32_t *)(&float_in);
uint16_t * uint16_in = (uint16_t *)(&float_in);
while(index<n) {
#if defined(DOUBLE)
float_in = (float)(*(in+index_in));
#else
float_in = *(in+index_in);
#endif
switch((*uint32_in) & 0xff800000u) {
case (0x00000000u): /* Type 1: Positive denormal */
*(out+index_out) = 0x0000u;
break;
case (0x80000000u): /* Type 2: Negative denormal */
*(out+index_out) = 0x8000u;
break;
case (0x7f800000u): /* Type 3: Positive infinity or NAN */
case (0xff800000u): /* Type 4: Negative infinity or NAN */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
*(out+index_out) = uint16_in[1];
#else
*(out+index_out) = uint16_in[0];
#endif
/* Specific for NAN */
if (((*uint32_in) & 0x007fffffu) != 0) {
/* Force to be QNAN */
*(out+index_out) |= 0x0040u;
}
break;
default: /* Type 5: Normal case */
(*uint32_in) += ((((*uint32_in) >> 16) & 0x1u) + 0x7fffu);
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
*(out+index_out) = uint16_in[1];
#else
*(out+index_out) = uint16_in[0];
#endif
break;
}
index_in += inc_in;
index_out += inc_out;
index++;
}
}
#ifndef HAVE_TOBF16_ACCL_KERNEL
static void tobf16_accl_kernel(BLASLONG n, const FLOAT_TYPE * in, bfloat16 * out)
{
tobf16_generic_kernel(n, in, 1, out, 1);
}
#endif
static void tobf16_compute(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out)
{
if ((inc_in == 1) && (inc_out == 1)) {
tobf16_accl_kernel(n, in, out);
} else {
tobf16_generic_kernel(n, in, inc_in, out, inc_out);
}
}
#if defined(SMP)
static int tobf16_thread_func(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT_TYPE dummy2,
FLOAT_TYPE *x, BLASLONG inc_x, bfloat16 *y, BLASLONG inc_y,
FLOAT_TYPE *dummy3, BLASLONG dummy4)
{
tobf16_compute(n, x, inc_x, y, inc_y);
return 0;
}
extern int blas_level1_thread(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha,
void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc,
int (*function)(), int nthreads);
#endif
void CNAME(BLASLONG n, FLOAT_TYPE * in, BLASLONG inc_in, bfloat16 * out, BLASLONG inc_out)
{
if (n <= 0) return;
#if defined(SMP)
int nthreads;
FLOAT_TYPE dummy_alpha;
FLOAT_TYPE dummy_c;
#endif
#if defined(SMP)
if (inc_in == 0 || inc_out == 0 || n <= 100000) {
nthreads = 1;
} else {
if (n/100000 < 100) {
nthreads = 4;
} else {
nthreads = 16;
}
}
if (nthreads == 1) {
tobf16_compute(n, in, inc_in, out, inc_out);
} else {
#if defined(DOUBLE)
int mode = BLAS_REAL | BLAS_DTOBF16;
#elif defined(SINGLE)
int mode = BLAS_REAL | BLAS_STOBF16;
#endif
blas_level1_thread(mode, n, 0, 0, &dummy_alpha,
in, inc_in, out, inc_out, &dummy_c, 0,
(void *)tobf16_thread_func, nthreads);
}
#else
tobf16_compute(n, in, inc_in, out, inc_out);
#endif
}

View File

@ -35,7 +35,8 @@ typedef unsigned long BLASULONG;
#endif #endif
#ifndef BFLOAT16 #ifndef BFLOAT16
typedef unsigned short bfloat16; #include <stdint.h>
typedef uint16_t bfloat16;
#endif #endif
#ifdef OPENBLAS_USE64BITINT #ifdef OPENBLAS_USE64BITINT

View File

@ -3,11 +3,18 @@ include_directories(${PROJECT_BINARY_DIR})
enable_language(Fortran) enable_language(Fortran)
set(OpenBLAS_Tests if (BUILD_SINGLE)
sblat1 sblat2 sblat3 list( APPEND OpenBLAS_Tests sblat1 sblat2 sblat3)
dblat1 dblat2 dblat3 endif()
cblat1 cblat2 cblat3 if (BUILD_DOUBLE)
zblat1 zblat2 zblat3) list (APPEND OpenBLAS_Tests dblat1 dblat2 dblat3)
endif()
if (BUILD_COMPLEX)
list (APPEND OpenBLAS_Tests cblat1 cblat2 cblat3)
endif()
if (BUILD_COMPLEX16)
list (APPEND OpenBLAS_Tests zblat1 zblat2 zblat3)
endif()
foreach(test_bin ${OpenBLAS_Tests}) foreach(test_bin ${OpenBLAS_Tests})
add_executable(${test_bin} ${test_bin}.f) add_executable(${test_bin} ${test_bin}.f)

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_SINGLE
CTEST(amax, samax){ CTEST(amax, samax){
blasint N=3, inc=1; blasint N=3, inc=1;
float te_max=0.0, tr_max=0.0; float te_max=0.0, tr_max=0.0;
@ -43,7 +44,8 @@ CTEST(amax, samax){
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
} }
#endif
#ifdef BUILD_DOUBLE
CTEST(amax, damax){ CTEST(amax, damax){
blasint N=3, inc=1; blasint N=3, inc=1;
double te_max=0.0, tr_max=0.0; double te_max=0.0, tr_max=0.0;
@ -54,3 +56,5 @@ CTEST(amax, damax){
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
} }
#endif

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_DOUBLE
CTEST(axpy,daxpy_inc_0) CTEST(axpy,daxpy_inc_0)
{ {
blasint i; blasint i;
@ -52,7 +53,9 @@ CTEST(axpy,daxpy_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX16
CTEST(axpy,zaxpy_inc_0) CTEST(axpy,zaxpy_inc_0)
{ {
blasint i; blasint i;
@ -71,7 +74,9 @@ CTEST(axpy,zaxpy_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_SINGLE
CTEST(axpy,saxpy_inc_0) CTEST(axpy,saxpy_inc_0)
{ {
blasint i; blasint i;
@ -90,7 +95,9 @@ CTEST(axpy,saxpy_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX
CTEST(axpy,caxpy_inc_0) CTEST(axpy,caxpy_inc_0)
{ {
blasint i; blasint i;
@ -109,3 +116,5 @@ CTEST(axpy,caxpy_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_COMPLEX16
CTEST( zdotu,zdotu_n_1) CTEST( zdotu,zdotu_n_1)
{ {
blasint N=1,incX=1,incY=1; blasint N=1,incX=1,incY=1;
@ -80,3 +81,5 @@ CTEST(zdotu, zdotu_offset_1)
#endif #endif
} }
#endif

View File

@ -36,6 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ELEMENTS 50 #define ELEMENTS 50
#define INCREMENT 2 #define INCREMENT 2
#ifdef BUILD_SINGLE
CTEST(ismin, positive_step_2){ CTEST(ismin, positive_step_2){
blasint i; blasint i;
blasint N = ELEMENTS, inc = INCREMENT; blasint N = ELEMENTS, inc = INCREMENT;
@ -87,3 +88,4 @@ CTEST(ismax, negative_step_2){
blasint index = BLASFUNC(ismax)(&N, x, &inc); blasint index = BLASFUNC(ismax)(&N, x, &inc);
ASSERT_EQUAL(9, index); ASSERT_EQUAL(9, index);
} }
#endif

View File

@ -22,6 +22,7 @@ double m[DATASIZE*DATASIZE];
CTEST(kernel_regress,skx_avx) CTEST(kernel_regress,skx_avx)
{ {
#ifdef BUILD_DOUBLE
double norm; double norm;
int i, j, info; int i, j, info;
srand(0); srand(0);
@ -47,4 +48,5 @@ CTEST(kernel_regress,skx_avx)
norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1); norm = cblas_dnrm2(DATASIZE*DATASIZE, X, 1);
ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10); ASSERT_DBL_NEAR_TOL(0.0, norm, 1e-10);
#endif
} }

View File

@ -32,7 +32,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/ **********************************************************************************/
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_SINGLE
CTEST(min, smin_negative){ CTEST(min, smin_negative){
blasint N=3, inc=1; blasint N=3, inc=1;
float te_min=0.0, tr_min=0.0; float te_min=0.0, tr_min=0.0;
@ -43,7 +43,9 @@ CTEST(min, smin_negative){
ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), SINGLE_EPS);
} }
#endif
#ifdef BUILD_DOUBLE
CTEST(min, dmin_positive){ CTEST(min, dmin_positive){
blasint N=3, inc=1; blasint N=3, inc=1;
double te_min=0.0, tr_min=0.0; double te_min=0.0, tr_min=0.0;
@ -54,7 +56,9 @@ CTEST(min, dmin_positive){
ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_min), (double)(te_min), DOUBLE_EPS);
} }
#endif
#ifdef BUILD_SINGLE
CTEST(min, smin_zero){ CTEST(min, smin_zero){
blasint N=3, inc=1; blasint N=3, inc=1;
float te_min=0.0, tr_min=0.0; float te_min=0.0, tr_min=0.0;
@ -76,7 +80,9 @@ CTEST(max, smax_negative){
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
} }
#endif
#ifdef BUILD_DOUBLE
CTEST(max, dmax_positive){ CTEST(max, dmax_positive){
blasint N=3, inc=1; blasint N=3, inc=1;
double te_max=0.0, tr_max=0.0; double te_max=0.0, tr_max=0.0;
@ -87,7 +93,8 @@ CTEST(max, dmax_positive){
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), DOUBLE_EPS);
} }
#endif
#ifdef BUILD_SINGLE
CTEST(max, smax_zero){ CTEST(max, smax_zero){
blasint N=3, inc=1; blasint N=3, inc=1;
float te_max=0.0, tr_max=0.0; float te_max=0.0, tr_max=0.0;
@ -98,3 +105,5 @@ CTEST(max, smax_zero){
ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS); ASSERT_DBL_NEAR_TOL((double)(tr_max), (double)(te_max), SINGLE_EPS);
} }
#endif

View File

@ -39,7 +39,6 @@ void BLASFUNC(zpotrs_(char*, BLASINT*, BLASINT*, complex double*,
BLASINT*, complex double*, BLASINT*, BLASINT*); BLASINT*, complex double*, BLASINT*, BLASINT*);
*/ */
//https://github.com/xianyi/OpenBLAS/issues/695 //https://github.com/xianyi/OpenBLAS/issues/695
CTEST(potrf, bug_695){ CTEST(potrf, bug_695){
@ -151,8 +150,10 @@ CTEST(potrf, bug_695){
blasint n=10; blasint n=10;
blasint info[1]; blasint info[1];
#ifdef BUILD_COMPLEX
BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info); BLASFUNC(cpotrf)(&up, &n, (float*)(A1), &n, info);
//printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91])); //printf("%g+%g*I\n", creal(A1[91]), cimag(A1[91]));
#endif
openblas_complex_double A2[100] = openblas_complex_double A2[100] =
{ {
@ -282,8 +283,9 @@ CTEST(potrf, bug_695){
}; };
char lo = 'L'; char lo = 'L';
blasint nrhs = 2; blasint nrhs = 2;
#ifdef BUILD_COMPLEX16
BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info); BLASFUNC(zpotrs)(&lo, &n, &nrhs, (double*)(A2), &n, (double*)(B), &n, info);
#endif
// note that this is exactly equal to A1 // note that this is exactly equal to A1
openblas_complex_float A3[100] = openblas_complex_float A3[100] =
{ {
@ -388,14 +390,15 @@ CTEST(potrf, bug_695){
openblas_make_complex_float(-0.9617417, -1.2486815), openblas_make_complex_float(-0.9617417, -1.2486815),
openblas_make_complex_float(3.4629636, +0.0) openblas_make_complex_float(3.4629636, +0.0)
}; };
#ifdef BUILD_COMPLEX
BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info); BLASFUNC(cpotrf)(&up, &n, (float*)(A3), &n, info);
// printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91])); // printf("%g+%g*I\n", creal(A3[91]), cimag(A3[91]));
if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) { if(isnan(CREAL(A3[91])) || isnan(CIMAG(A3[91]))) {
CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__); CTEST_ERR("%s:%d got NaN", __FILE__, __LINE__);
} }
#endif
} }
// Check potrf factorizes a small problem correctly // Check potrf factorizes a small problem correctly
CTEST(potrf, smoketest_trivial){ CTEST(potrf, smoketest_trivial){
float A1s[4] = {2, 0.3, 0.3, 3}; float A1s[4] = {2, 0.3, 0.3, 3};
@ -439,31 +442,43 @@ CTEST(potrf, smoketest_trivial){
uplo = 'U'; uplo = 'U';
} }
#ifdef BUILD_SINGLE
BLASFUNC(scopy)(&nv, A1s, &inc, As, &inc); BLASFUNC(scopy)(&nv, A1s, &inc, As, &inc);
#endif
#ifdef BUILD_DOUBLE
BLASFUNC(dcopy)(&nv, A1d, &inc, Ad, &inc); BLASFUNC(dcopy)(&nv, A1d, &inc, Ad, &inc);
#endif
#ifdef BUILD_COMPLEX
BLASFUNC(ccopy)(&nv, (float *)A1c, &inc, (float *)Ac, &inc); BLASFUNC(ccopy)(&nv, (float *)A1c, &inc, (float *)Ac, &inc);
#endif
#ifdef BUILD_COMPLEX16
BLASFUNC(zcopy)(&nv, (double *)A1z, &inc, (double *)Az, &inc); BLASFUNC(zcopy)(&nv, (double *)A1z, &inc, (double *)Az, &inc);
#endif
#ifdef BUILD_SINGLE
BLASFUNC(spotrf)(&uplo, &n, As, &n, &info); BLASFUNC(spotrf)(&uplo, &n, As, &n, &info);
if (info != 0) { if (info != 0) {
CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__);
} }
#endif
#ifdef BUILD_DOUBLE
BLASFUNC(dpotrf)(&uplo, &n, Ad, &n, &info); BLASFUNC(dpotrf)(&uplo, &n, Ad, &n, &info);
if (info != 0) { if (info != 0) {
CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__);
} }
#endif
#ifdef BUILD_COMPLEX
BLASFUNC(cpotrf)(&uplo, &n, (float *)Ac, &n, &info); BLASFUNC(cpotrf)(&uplo, &n, (float *)Ac, &n, &info);
if (info != 0) { if (info != 0) {
CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__);
} }
#endif
#ifdef BUILD_COMPLEX16
BLASFUNC(zpotrf)(&uplo, &n, (double *)Az, &n, &info); BLASFUNC(zpotrf)(&uplo, &n, (double *)Az, &n, &info);
if (info != 0) { if (info != 0) {
CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__); CTEST_ERR("%s:%d info != 0", __FILE__, __LINE__);
} }
#endif
/* Fill the other triangle */ /* Fill the other triangle */
if (uplo == 'L') { if (uplo == 'L') {
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
@ -495,14 +510,20 @@ CTEST(potrf, smoketest_trivial){
trans1 = 'C'; trans1 = 'C';
trans2 = 'N'; trans2 = 'N';
} }
#ifdef BUILD_SINGLE
BLASFUNC(sgemm)(&trans1, &trans2, &n, &n, &n, &ones, As, &n, As, &n, &zeros, Bs, &n); BLASFUNC(sgemm)(&trans1, &trans2, &n, &n, &n, &ones, As, &n, As, &n, &zeros, Bs, &n);
#endif
#ifdef BUILD_DOUBLE
BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, Ad, &n, Ad, &n, &zerod, Bd, &n); BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, Ad, &n, Ad, &n, &zerod, Bd, &n);
#endif
#ifdef BUILD_COMPLEX
BLASFUNC(cgemm)(&trans1, &trans2, &n, &n, &n, (float *)&onec, BLASFUNC(cgemm)(&trans1, &trans2, &n, &n, &n, (float *)&onec,
(float *)Ac, &n, (float *)Ac, &n, (float *)&zeroc, (float *)Bc, &n); (float *)Ac, &n, (float *)Ac, &n, (float *)&zeroc, (float *)Bc, &n);
#endif
#ifdef BUILD_COMPLEX16
BLASFUNC(zgemm)(&trans1, &trans2, &n, &n, &n, (double *)&onez, BLASFUNC(zgemm)(&trans1, &trans2, &n, &n, &n, (double *)&onez,
(double *)Az, &n, (double *)Az, &n, (double *)&zeroz, (double *)Bz, &n); (double *)Az, &n, (double *)Az, &n, (double *)&zeroz, (double *)Bz, &n);
#endif
/* Check result is close to original */ /* Check result is close to original */
for (i = 0; i < n; ++i) { for (i = 0; i < n; ++i) {
for (j = 0; j < n; ++j) { for (j = 0; j < n; ++j) {

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_DOUBLE
CTEST(rot,drot_inc_0) CTEST(rot,drot_inc_0)
{ {
blasint i=0; blasint i=0;
@ -52,7 +53,9 @@ CTEST(rot,drot_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX16
CTEST(rot,zdrot_inc_0) CTEST(rot,zdrot_inc_0)
{ {
blasint i=0; blasint i=0;
@ -72,7 +75,9 @@ CTEST(rot,zdrot_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_SINGLE
CTEST(rot,srot_inc_0) CTEST(rot,srot_inc_0)
{ {
blasint i=0; blasint i=0;
@ -91,7 +96,9 @@ CTEST(rot,srot_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX
CTEST(rot, csrot_inc_0) CTEST(rot, csrot_inc_0)
{ {
blasint i=0; blasint i=0;
@ -110,3 +117,5 @@ CTEST(rot, csrot_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
} }
} }
#endif

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_DOUBLE
CTEST (drotmg,rotmg) CTEST (drotmg,rotmg)
{ {
double te_d1, tr_d1; double te_d1, tr_d1;
@ -204,3 +205,4 @@ CTEST(drotmg, drotmg_D1_big_D2_big_flag_zero)
ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(tr_param[i], te_param[i], DOUBLE_EPS);
} }
} }
#endif

View File

@ -33,6 +33,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include "openblas_utest.h" #include "openblas_utest.h"
#ifdef BUILD_DOUBLE
CTEST(swap,dswap_inc_0) CTEST(swap,dswap_inc_0)
{ {
blasint i=0; blasint i=0;
@ -50,7 +51,9 @@ CTEST(swap,dswap_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX16
CTEST(swap,zswap_inc_0) CTEST(swap,zswap_inc_0)
{ {
blasint i=0; blasint i=0;
@ -68,7 +71,9 @@ CTEST(swap,zswap_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], DOUBLE_EPS);
} }
} }
#endif
#ifdef BUILD_SINGLE
CTEST(swap,sswap_inc_0) CTEST(swap,sswap_inc_0)
{ {
blasint i=0; blasint i=0;
@ -86,7 +91,9 @@ CTEST(swap,sswap_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
} }
} }
#endif
#ifdef BUILD_COMPLEX
CTEST(swap,cswap_inc_0) CTEST(swap,cswap_inc_0)
{ {
blasint i=0; blasint i=0;
@ -104,3 +111,5 @@ CTEST(swap,cswap_inc_0)
ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS); ASSERT_DBL_NEAR_TOL(y2[i], y1[i], SINGLE_EPS);
} }
} }
#endif