diff --git a/CMakeLists.txt b/CMakeLists.txt index 21f0c9571..aeb4399e4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 11.dev) +set(OpenBLAS_PATCH_VERSION 12.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions diff --git a/Changelog.txt b/Changelog.txt index bd0e60992..edd3563ec 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,9 +1,36 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.12 + 24-Oct-2020 + +common: + * Fixed missing BLAS/LAPACK functions (inadvertently dropped during + the build system restructuring) + * Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458) + +POWER: + * Added optimized SCOPY/CCOPY kernels for POWER10 + * Increased and unified the default size of the GEMM BUFFER + * Fixed building for POWER10 in DYNAMIC_ARCH mode + * POWER10 compatibility test now checks binutils version as well + * Cleaned up compiler warnings + +x86_64: + * corrected compiler version checks for AVX2 compatibility + * added compiler option -mavx2 for building with flang + * fixed direct SGEMM pathway for small matrix sizes (broken by + the code refactoring in 0.3.11) + * fixed unhandled partial register clobbers in several kernels + for AXPY,DOT,GEMV_N and GEMV_T flagged by gcc10 tree-vectorizer + +ARMV8: + * improved Apple Vortex support to include cross-compiling + ==================================================================== Version 0.3.11 17-Oct-2020 - common: +common: * API change: the newly added BFLOAT16 functions were renamed to use the letter "B" instead of "H" to avoid potential confusion with @@ -28,7 +55,7 @@ Version 0.3.11 * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as enabling these options * 
Fixed detection of gfortran when invoked through an mpi wrapper - * Improve thread reinitialization performance with OpenMP xafter a fork + * Improve thread reinitialization performance with OpenMP after a fork * Added support for building only the subset of the library required for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE * Optional function name prefixes and suffixes are now correctly @@ -66,7 +93,6 @@ ARMV8: * Fixed cpu detection on BSD-like systems * Fixed compilation in -std=C18 mode - IBM Z: * Added support for compiling with the clang compiler * Improved GEMM performance on Z14 diff --git a/Makefile.arm b/Makefile.arm index fac6b56824..a27b58e84 100644 --- a/Makefile.arm +++ b/Makefile.arm @@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6) CCOMMON_OPT += -mfpu=vfp FCOMMON_OPT += -mfpu=vfp endif + +ifdef HAVE_NEON +CCOMMON_OPT += -mfpu=neon +FCOMMON_OPT += -mfpu=neon +endif diff --git a/Makefile.rule b/Makefile.rule index e8f8c2951..1a0965d08 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.11.dev +VERSION = 0.3.12.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library @@ -295,10 +295,13 @@ COMMON_PROF = -pg -# the below is not yet configurable, use cmake if you need to build only select types -BUILD_SINGLE = 1 -BUILD_DOUBLE = 1 -BUILD_COMPLEX = 1 -BUILD_COMPLEX16 = 1 +# By default the library contains BLAS functions (and LAPACK if selected) for all input types. +# To build a smaller library supporting e.g. only single precision real (SGEMM etc.) 
or only +# the functions for complex numbers, uncomment the desired type(s) below +# BUILD_SINGLE = 1 +# BUILD_DOUBLE = 1 +# BUILD_COMPLEX = 1 +# BUILD_COMPLEX16 = 1 +# # End of user configuration # diff --git a/Makefile.system b/Makefile.system index 30d8f4ccf..52d3e2cdc 100644 --- a/Makefile.system +++ b/Makefile.system @@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1) else GCCDUMPVERSION_PARAM := -dumpversion endif +GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) endif @@ -855,7 +856,8 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG FCOMMON_OPT += -Mrecursive -Kieee ifeq ($(OSNAME), Linux) ifeq ($(ARCH), x86_64) -FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) +# First "."-delimited field of the first line of the compiler's --version output +FLANG_VENDOR := $(shell $(FC) --version|cut -f 1 -d "."|head -1) ifeq ($(FLANG_VENDOR),AOCC) FCOMMON_OPT += -fno-unroll-loops endif diff --git a/Makefile.x86_64 b/Makefile.x86_64 index a849f0b01..49a9a0a23 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1) ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) CCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake @@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH ifndef NO_AVX2 ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) 
+GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) CCOMMON_OPT += -mavx2 endif diff --git a/benchmark/amax.c b/benchmark/amax.c index 19ae95c8b..29310dd71 100644 --- a/benchmark/amax.c +++ b/benchmark/amax.c @@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMAX #ifdef COMPLEX #ifdef DOUBLE -#define AMAX BLASFUNC(dzamax) +#define AMAX BLASFUNC(dzamax) #else -#define AMAX BLASFUNC(scamax) +#define AMAX BLASFUNC(scamax) #endif #else #ifdef DOUBLE -#define AMAX BLASFUNC(damax) +#define AMAX BLASFUNC(damax) #else -#define AMAX BLASFUNC(samax) +#define AMAX BLASFUNC(samax) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory 
allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; + int from = 1; + int to = 200; + int step = 1; - int from = 1; - int to = 200; - int step = 1; + double time1, timeg; - struct timeval start, stop; - double time1,timeg; + argc--; + argv++; - argc--;argv++; + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -152,37 +100,31 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; + fprintf(stderr, " %6d : ", (int)m); - 
fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { + for (i = 0; i < m * COMPSIZE * abs(inc_x); i++) + { + x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5; + } - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AMIN #ifdef COMPLEX #ifdef DOUBLE -#define AMIN BLASFUNC(dzamin) +#define AMIN BLASFUNC(dzamin) #else -#define AMIN BLASFUNC(scamin) +#define AMIN BLASFUNC(scamin) #endif #else #ifdef DOUBLE -#define AMIN BLASFUNC(damin) +#define AMIN BLASFUNC(damin) #else -#define AMIN BLASFUNC(samin) +#define AMIN BLASFUNC(samin) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; blasint 
m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; + int from = 1; + int to = 200; + int step = 1; - struct timeval start, stop; - double time1,timeg; + double time1, timeg; - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} - if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; + } - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); } #ifdef __linux @@ -151,39 +100,35 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); + for (l = 0; l < loops; l++) + { - for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef ASUM #ifdef COMPLEX #ifdef DOUBLE -#define ASUM BLASFUNC(dzasum) +#define ASUM BLASFUNC(dzasum) #else -#define ASUM BLASFUNC(scasum) +#define ASUM BLASFUNC(scasum) #endif #else #ifdef DOUBLE -#define ASUM 
BLASFUNC(dasum) +#define ASUM BLASFUNC(dasum) #else -#define ASUM BLASFUNC(sasum) +#define ASUM BLASFUNC(sasum) #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - -int main(int argc, char *argv[]){ +int main(int argc, char *argv[]) +{ FLOAT *x; FLOAT result; blasint m, i; - blasint inc_x=1; + blasint inc_x = 1; int loops = 1; int l; char *p; - int from = 1; - int to = 200; - int step = 1; - -#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) - struct timeval start, stop; - double time1,timeg; -#else - struct timespec start = { 0, 0 }, stop = { 0, 0 }; + int from = 1; + int to = 200; + int step = 1; double time1, timeg; -#endif - argc--;argv++; + argc--; + argv++; - if (argc > 0) { from = atol(*argv); argc--; argv++;} 
- if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} - if (argc > 0) { step = atol(*argv); argc--; argv++;} - - if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); - if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); - - fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); - - if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ - fprintf(stderr,"Out of Memory!!\n");exit(1); + if (argc > 0) + { + from = atol(*argv); + argc--; + argv++; + } + if (argc > 0) + { + to = MAX(atol(*argv), from); + argc--; + argv++; + } + if (argc > 0) + { + step = atol(*argv); + argc--; + argv++; } + if ((p = getenv("OPENBLAS_LOOPS"))) + loops = atoi(p); + if ((p = getenv("OPENBLAS_INCX"))) + inc_x = atoi(p); + + fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops); + + if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL) + { + fprintf(stderr, "Out of Memory!!\n"); + exit(1); + } #ifdef __linux srandom(getpid()); @@ -158,45 +100,33 @@ int main(int argc, char *argv[]){ fprintf(stderr, " SIZE Flops\n"); - for(m = from; m <= to; m += step) + for (m = from; m <= to; m += step) { - timeg=0; + timeg = 0; - fprintf(stderr, " %6d : ", (int)m); + fprintf(stderr, " %6d : ", (int)m); - for (l=0; l1) - timeg /= loops; + if (loops > 1) + timeg /= loops; #ifdef COMPLEX fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); #else fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); #endif - } return 0; diff --git a/benchmark/axpby.c b/benchmark/axpby.c index 793ee7e40..d02d9a889 100644 --- a/benchmark/axpby.c +++ b/benchmark/axpby.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPBY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -176,16 +104,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef AXPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN 
IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -127,8 +56,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timespec start, stop; double time1,timeg; argc--;argv++; @@ -175,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - clock_gettime( CLOCK_REALTIME, &start); + begin(); AXPY (&m, alpha, x, &inc_x, y, &inc_y ); - clock_gettime( CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; + time1 = getsec(); timeg += time1; diff 
--git a/benchmark/bench.h b/benchmark/bench.h new file mode 100644 index 000000000..1f9b8986c --- /dev/null +++ b/benchmark/bench.h @@ -0,0 +1,108 @@ +#include <stdio.h> +#include <stdlib.h> +#include <time.h> +#ifdef __CYGWIN32__ +#include <sys/time.h> +#endif +#include "common.h" + +#if defined(__WIN32__) || defined(__WIN64__) + +#ifndef DELTA_EPOCH_IN_MICROSECS +#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL +#endif + +int gettimeofday(struct timeval *tv, void *tz){ + + FILETIME ft; + unsigned __int64 tmpres = 0; + static int tzflag; + + if (NULL != tv) + { + GetSystemTimeAsFileTime(&ft); + + tmpres |= ft.dwHighDateTime; + tmpres <<= 32; + tmpres |= ft.dwLowDateTime; + + /*converting file time to unix epoch*/ + tmpres /= 10; /*convert into microseconds*/ + tmpres -= DELTA_EPOCH_IN_MICROSECS; + tv->tv_sec = (long)(tmpres / 1000000UL); + tv->tv_usec = (long)(tmpres % 1000000UL); + } + + return 0; +} + +#endif + +#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 + +static void *huge_malloc(BLASLONG size){ + int shmid; + void *address; + +#ifndef SHM_HUGETLB +#define SHM_HUGETLB 04000 +#endif + + if ((shmid =shmget(IPC_PRIVATE, + (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), + SHM_HUGETLB | IPC_CREAT |0600)) < 0) { + printf( "Memory allocation failed(shmget).\n"); + exit(1); + } + + address = shmat(shmid, NULL, SHM_RND); + + if ((BLASLONG)address == -1){ + printf( "Memory allocation failed(shmat).\n"); + exit(1); + } + + shmctl(shmid, IPC_RMID, 0); + + return address; +} + + +#define malloc huge_malloc + +#endif + +/* Timestamps written by begin() and end(); getsec() returns their difference in seconds. */ +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + struct timeval start, stop; +#else + struct timespec start = { 0, 0 }, stop = { 0, 0 }; +#endif + +/* Seconds between the most recent begin() and end() calls. */ +double getsec(void) +{ +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; +#else + return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 
1.e-9; +#endif +} + +/* Record the start timestamp. */ +void begin(void) { +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &start, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &start); +#endif +} + +/* Record the stop timestamp. */ +void end(void) { +#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS) + gettimeofday( &stop, (struct timezone *)0); +#else + clock_gettime(CLOCK_REALTIME, &stop); +#endif +} diff --git a/benchmark/cholesky.c b/benchmark/cholesky.c index 5908b6085..65b20d039 100644 --- a/benchmark/cholesky.c +++ b/benchmark/cholesky.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -71,41 +66,6 @@ double fabs(double); #endif #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - - static __inline double getmflops(int ratio, int m, double secs){ double mm = (double)m; @@ -145,7 +105,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1; argc--;argv++; @@ -220,20 +179,19 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - 
gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - maxerr = 0.; if (!(uplos & 1)) { for (j = 0; j < m; j++) { diff --git a/benchmark/copy.c b/benchmark/copy.c index eb5148fff..c5e447521 100644 --- a/benchmark/copy.c +++ b/benchmark/copy.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef COPY @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if 
((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,11 +57,9 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1 = 0.0, timeg = 0.0; long nanos = 0; time_t seconds = 0; - struct timespec time_start = { 0, 0 }, time_end = { 0, 0 }; argc--;argv++; @@ -176,15 +103,10 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(ddot) #else #define DOT BLASFUNC(sdot) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - 
-#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +49,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +95,12 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); result = DOT (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + end(); + timeg += getsec(); } diff --git a/benchmark/geev.c b/benchmark/geev.c index 4fd2c8d6f..6e22cdfb6 100644 --- a/benchmark/geev.c +++ b/benchmark/geev.c @@ -36,13 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEEV @@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a, FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void 
*huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; @@ -154,7 +83,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -223,7 +151,7 @@ int main(int argc, char *argv[]){ for(m = from; m <= to; m += step){ fprintf(stderr, " %6d : ", (int)m); - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; #ifndef COMPLEX @@ -239,14 +167,14 @@ int main(int argc, char *argv[]){ GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/gemm.c b/benchmark/gemm.c index 8cd14bbed..35f5096f3 100644 --- a/benchmark/gemm.c +++ b/benchmark/gemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ IFLOAT *a, *b; @@ -139,7 +68,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1, timeg; argc--;argv++; @@ -228,14 +156,14 @@ int main(int argc, char *argv[]){ ldc = m; fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); - gettimeofday( &start, (struct timezone *)0); + begin(); for (j=0; j -#include -#ifdef 
__CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GEMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -133,7 +62,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -187,16 +115,12 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 
1.e-6; - - timeg += time1; - + end(); + timeg += getsec(); } timeg /= loops; diff --git a/benchmark/gemv.c b/benchmark/gemv.c index fb1f541d3..a0001277a 100644 --- a/benchmark/gemv.c +++ b/benchmark/gemv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GEMV @@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT 
*a, *x, *y; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -211,10 +139,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } @@ -248,10 +176,10 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + end(); + time1 = getsec(); timeg += time1; } diff --git a/benchmark/ger.c b/benchmark/ger.c index d53d328f0..7ce08c3ad 100644 --- a/benchmark/ger.c +++ b/benchmark/ger.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef GER @@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -131,7 +59,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -198,16 +125,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -66,71 +61,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; 
- - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -142,7 +72,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -194,22 +123,18 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GESV (&m, &m, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); - - - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - + end(); + time1 = getsec(); fprintf(stderr, "%10.2f MFlops %10.6f s\n", COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); - } return 0; diff --git a/benchmark/getri.c b/benchmark/getri.c index a07014768..98a860906 100644 --- a/benchmark/getri.c +++ b/benchmark/getri.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. 
*/ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef GETRF #undef GETRI @@ -72,71 +67,6 @@ extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*work; @@ -148,7 +78,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -205,21 +134,21 @@ int main(int argc, char *argv[]){ exit(1); } - gettimeofday( &start, (struct timezone *)0); + begin(); lwork = -1; GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); 
lwork = (blasint)wkopt[0]; GETRI(&m, a, &m, ipiv, work, &lwork, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "failed compute inverse matrix .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops : %10.2f Sec : %d\n", diff --git a/benchmark/hbmv.c b/benchmark/hbmv.c index 60ba9fb89..35249bdf9 100644 --- a/benchmark/hbmv.c +++ b/benchmark/hbmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HBMV - #ifdef DOUBLE #define HBMV BLASFUNC(zhbmv) #else #define HBMV BLASFUNC(chbmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation 
failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -125,7 +52,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -186,15 +112,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - timeg += time1; + timeg += getsec(); } diff --git a/benchmark/hemm.c b/benchmark/hemm.c index 2bc165458..a0a9985ad 100644 --- a/benchmark/hemm.c +++ b/benchmark/hemm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMM @@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define HEMM BLASFUNC(chemm) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -126,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/hemv.c b/benchmark/hemv.c index 98618a04e..ad130ddd0 100644 --- 
a/benchmark/hemv.c +++ b/benchmark/hemv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HEMV - #ifdef DOUBLE #define HEMV BLASFUNC(zhemv) #else #define HEMV BLASFUNC(chemv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +108,13 @@ 
int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/her.c b/benchmark/her.c index 010f8120d..cd1fb7f48 100644 --- a/benchmark/her.c +++ b/benchmark/her.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER - #ifdef DOUBLE #define HER BLASFUNC(zher) #else #define HER BLASFUNC(cher) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - 
printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -126,8 +53,6 @@ int main(int argc, char *argv[]){ int from = 1; int to = 200; int step = 1; - - struct timeval start, stop; double time1; argc--;argv++; @@ -166,15 +91,13 @@ int main(int argc, char *argv[]){ x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HER (&uplo, &m, alpha, x, &incx, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2.c b/benchmark/her2.c index 0f80f3ed9..d87bfd466 100644 --- a/benchmark/her2.c +++ b/benchmark/her2.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2 - #ifdef DOUBLE #define HER2 BLASFUNC(zher2) #else #define HER2 BLASFUNC(cher2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,16 +95,13 @@ int main(int argc, char *argv[]){ y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); - + begin(); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); - 
gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; - - gettimeofday( &start, (struct timezone *)0); + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/her2k.c b/benchmark/her2k.c index 021873beb..d3cdce696 100644 --- a/benchmark/her2k.c +++ b/benchmark/her2k.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HER2K #ifdef DOUBLE @@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define HER2K BLASFUNC(cher2k) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if 
((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -169,13 +96,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/herk.c b/benchmark/herk.c index c09d35c1f..628dc2c11 100644 --- a/benchmark/herk.c +++ b/benchmark/herk.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HERK - #ifdef DOUBLE #define HERK BLASFUNC(zherk) #else #define HERK BLASFUNC(cherk) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -127,7 +54,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -167,18 +93,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = 
(double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); - } return 0; diff --git a/benchmark/hpmv.c b/benchmark/hpmv.c index b0157094e..907e2adc4 100644 --- a/benchmark/hpmv.c +++ b/benchmark/hpmv.c @@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef HPMV - #ifdef DOUBLE #define HPMV BLASFUNC(zhpmv) #else #define HPMV BLASFUNC(chpmv) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) { - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) { - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - 
exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -124,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -183,13 +109,13 @@ int main(int argc, char *argv[]){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamax.c b/benchmark/iamax.c index c87044ab4..15618cbcc 100644 --- a/benchmark/iamax.c +++ b/benchmark/iamax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMAX @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/iamin.c b/benchmark/iamin.c index e7c8e59e4..a57638ecc 100644 --- a/benchmark/iamin.c +++ 
b/benchmark/iamin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IAMIN @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int 
argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imax.c b/benchmark/imax.c index b56ef64ba..b96b17167 100644 --- a/benchmark/imax.c +++ b/benchmark/imax.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address 
= shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/imin.c b/benchmark/imin.c index 4a92c8bd0..095eacca9 100644 --- a/benchmark/imin.c +++ b/benchmark/imin.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef IMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); IMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/linpack.c b/benchmark/linpack.c index 661a44175..202035245 100644 --- a/benchmark/linpack.c +++ 
b/benchmark/linpack.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -72,71 +67,6 @@ double fabs(double); #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ FLOAT maxerr; - struct timeval start, stop; double time1, time2; argc--;argv++; @@ -198,31 +127,31 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); GETRF (&m, &m, a, &m, ipiv, &info); - gettimeofday( &stop, (struct 
timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); - gettimeofday( &start, (struct timezone *)0); + begin(); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info) { fprintf(stderr, "Matrix is not singular .. %d\n", info); exit(1); } - time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time2 = getsec(); maxerr = 0.; diff --git a/benchmark/max.c b/benchmark/max.c index a19a386a2..301b943a5 100644 --- a/benchmark/max.c +++ b/benchmark/max.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMAX @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMAX (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/min.c b/benchmark/min.c index 4df8fb0fd..39df37a29 100644 --- a/benchmark/min.c +++ b/benchmark/min.c @@ 
-25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NAMIN @@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -121,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -160,13 +88,13 @@ int main(int argc, char *argv[]){ 
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NAMIN (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/nrm2.c b/benchmark/nrm2.c index 0f416621a..cd64d564a 100644 --- a/benchmark/nrm2.c +++ b/benchmark/nrm2.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef NRM2 @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, 
SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x; @@ -127,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -166,13 +94,13 @@ int main(int argc, char *argv[]){ x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); NRM2 (&m, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/potrf.c b/benchmark/potrf.c index cb4c23bab..116d0cca5 100644 --- a/benchmark/potrf.c +++ b/benchmark/potrf.c @@ -36,12 +36,7 @@ /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" double fabs(double); @@ -86,37 +81,7 @@ double fabs(double); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); -#if defined(__WIN32__) || defined(__WIN64__) -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - 
tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif int main(int argc, char *argv[]){ @@ -141,7 +106,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -217,18 +181,18 @@ int main(int argc, char *argv[]){ SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); - gettimeofday( &start, (struct timezone *)0); + begin(); POTRF(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrf info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; if ( btest == 'S' ) @@ -240,17 +204,17 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potrs info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; } @@ -258,18 +222,18 @@ int main(int argc, char *argv[]){ if ( btest == 'I' ) { - gettimeofday( &start, (struct timezone *)0); + begin(); POTRI(uplo[uplos], &m, b, &m, &info); - gettimeofday( &stop, (struct timezone *)0); + end(); if (info != 0) { fprintf(stderr, "Potri info = %d\n", info); exit(1); } - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; } diff --git a/benchmark/rot.c b/benchmark/rot.c 
index 69698988d..15b630e36 100644 --- a/benchmark/rot.c +++ b/benchmark/rot.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROT @@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -133,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; 
argc--;argv++; @@ -179,13 +108,13 @@ int main(int argc, char *argv[]){ for (l=0; l -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef ROTM @@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define ROTM BLASFUNC(srotm) #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz) -{ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid = - shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT | 0600)) < 0) { - printf("Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf("Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -122,7 +51,7 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timeval start, stop; + double time1, timeg; argc--; @@ -188,14 +117,13 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - gettimeofday(&start, (struct timezone *)0); + begin(); ROTM(&m, x, &inc_x, y, &inc_y, param); - gettimeofday(&stop, (struct 
timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + - (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/scal.c b/benchmark/scal.c index 8bd62c77c..8de6cfd04 100644 --- a/benchmark/scal.c +++ b/benchmark/scal.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SCAL @@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; 
-} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +57,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -174,13 +102,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SCAL (&m, alpha, x, &inc_x); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spmv.c b/benchmark/spmv.c index cff504d3b..e4dcbf4ae 100644 --- a/benchmark/spmv.c +++ b/benchmark/spmv.c @@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPMV - #ifndef COMPLEX #ifdef DOUBLE @@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -135,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -193,13 +120,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/spr.c 
b/benchmark/spr.c index 5dcaa4f8b..2fc9994f8 100755 --- a/benchmark/spr.c +++ b/benchmark/spr.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SPR @@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SPR BLASFUNC(sspr) #endif - - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*c; @@ -129,7 +56,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; 
- struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -173,13 +99,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR (&uplo, &m, alpha, c, &inc_x, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/spr2.c b/benchmark/spr2.c index a5f2791f7..8f194e83a 100755 --- a/benchmark/spr2.c +++ b/benchmark/spr2.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SPR2 @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a,*b,*c; @@ -129,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -182,13 +110,13 @@ int main(int argc, char *argv[]){ c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/swap.c b/benchmark/swap.c index 76d545995..64ebe5e9b 100644 --- 
a/benchmark/swap.c +++ b/benchmark/swap.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SWAP @@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -128,7 +58,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -175,13 
+104,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SWAP (&m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/symm.c b/benchmark/symm.c index bb9849eb5..1c6d91d00 100644 --- a/benchmark/symm.c +++ b/benchmark/symm.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMM @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/symv.c b/benchmark/symv.c index e4c892b5a..0a35aaef0 100644 --- a/benchmark/symv.c +++ 
b/benchmark/symv.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYMV @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x, *y; @@ -134,7 +63,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -192,13 +120,13 @@ int main(int 
argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/syr.c b/benchmark/syr.c index a9dd293e6..ebbf2bd3c 100644 --- a/benchmark/syr.c +++ b/benchmark/syr.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR @@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x,*a; @@ -124,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -165,13 +93,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2.c b/benchmark/syr2.c index 9efbca315..acbc86987 100644 --- a/benchmark/syr2.c +++ b/benchmark/syr2.c @@ -25,13 +25,7 
@@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYR2 @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYR2 BLASFUNC(ssyr2) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y, *a; @@ -125,7 +53,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -174,13 +101,13 @@ int main(int argc, char 
*argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syr2k.c b/benchmark/syr2k.c index a906559eb..3895c2861 100644 --- a/benchmark/syr2k.c +++ b/benchmark/syr2k.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef SYR2K @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = 
shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b, *c; @@ -137,7 +67,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -181,13 +110,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/syrk.c b/benchmark/syrk.c index 0fbb943f6..82606a21a 100644 --- a/benchmark/syrk.c +++ b/benchmark/syrk.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef SYRK @@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *c; @@ -137,7 +66,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -177,13 +105,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops\n", diff --git a/benchmark/tpmv.c b/benchmark/tpmv.c index fe9d07534..41f2e0fb8 100644 --- a/benchmark/tpmv.c +++ b/benchmark/tpmv.c 
@@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/tpsv.c b/benchmark/tpsv.c index 8472ac261..ebfa29692 100644 --- a/benchmark/tpsv.c +++ b/benchmark/tpsv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TPSV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trmm.c b/benchmark/trmm.c index 23af122b4..3ab9fc255 100644 --- a/benchmark/trmm.c +++ b/benchmark/trmm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -141,7 +71,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1; argc--;argv++; @@ -180,13 +109,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + 
end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", diff --git a/benchmark/trmv.c b/benchmark/trmv.c index 46641b3e4..0e8088b54 100644 --- a/benchmark/trmv.c +++ b/benchmark/trmv.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRMV @@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size) -{ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1) { - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]) { @@ -112,7 +73,6 @@ int main(int argc, char *argv[]) int to = 200; int step = 1; - struct timespec start = { 0, 0 }, stop = { 0, 0 }; double time1, timeg; argc--;argv++; @@ -153,11 +113,11 @@ int main(int argc, char *argv[]) } for (l = 0; l < loops; l++) { - clock_gettime(CLOCK_REALTIME, &start); + begin(); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); - clock_gettime(CLOCK_REALTIME, &stop); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; + time1 = getsec(); timeg += time1; } diff --git 
a/benchmark/trsm.c b/benchmark/trsm.c index 17676946a..d2ebd7f54 100644 --- a/benchmark/trsm.c +++ b/benchmark/trsm.c @@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" +#include "bench.h" #undef TRSM @@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *b; @@ -151,7 +81,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct 
timeval start, stop; double time1; argc--;argv++; @@ -196,13 +125,13 @@ int main(int argc, char *argv[]){ } } - gettimeofday( &start, (struct timezone *)0); + begin(); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; } diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 1734e2adb..66ac3a3c7 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include -#include "common.h" - +#include "bench.h" #undef GEMV #undef TRSV @@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *a, *x; @@ -133,7 +61,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timespec time_start, time_end; time_t seconds = 0; double time1,timeg; @@ -189,19 +116,13 @@ int main(int argc, char *argv[]){ for(l =0;l< loops;l++){ - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); - + begin(); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); - - clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); - nanos = time_end.tv_nsec - time_start.tv_nsec; - seconds = time_end.tv_sec - time_start.tv_sec; - - time1 = seconds + nanos /1.e9; + end(); + time1 = getsec(); timeg += time1; } - timeg /= loops; long long muls = n*(n+1)/2.0; long long adds = (n 
- 1.0)*n/2.0; diff --git a/benchmark/zdot-intel.c b/benchmark/zdot-intel.c index ba1515365..06cdde13a 100644 --- a/benchmark/zdot-intel.c +++ b/benchmark/zdot-intel.c @@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#define RETURN_BY_STACK 1 -#include "common.h" +#include "bench.h" +#define RETURN_BY_STACK 1 #undef DOT - #ifdef DOUBLE #define DOT BLASFUNC(zdotu) #else #define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, 
*y; @@ -123,7 +51,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -170,13 +97,13 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); DOT (&result, &m, x, &inc_x, y, &inc_y ); - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; + time1 = getsec(); timeg += time1; diff --git a/benchmark/zdot.c b/benchmark/zdot.c index fa624e859..23b3efcad 100644 --- a/benchmark/zdot.c +++ b/benchmark/zdot.c @@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *****************************************************************************/ -#include -#include -#ifdef __CYGWIN32__ -#include -#endif -#include "common.h" - +#include "bench.h" #undef DOT @@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define DOT BLASFUNC(cdotu) #endif - -#if defined(__WIN32__) || defined(__WIN64__) - -#ifndef DELTA_EPOCH_IN_MICROSECS -#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL -#endif - -int gettimeofday(struct timeval *tv, void *tz){ - - FILETIME ft; - unsigned __int64 tmpres = 0; - static int tzflag; - - if (NULL != tv) - { - GetSystemTimeAsFileTime(&ft); - - tmpres |= ft.dwHighDateTime; - tmpres <<= 32; - tmpres |= ft.dwLowDateTime; - - /*converting file time to unix epoch*/ - tmpres /= 10; /*convert into microseconds*/ - tmpres -= DELTA_EPOCH_IN_MICROSECS; - tv->tv_sec = (long)(tmpres / 1000000UL); - tv->tv_usec = (long)(tmpres % 1000000UL); - } - - return 0; -} - -#endif - -#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0 - -static void *huge_malloc(BLASLONG size){ - int shmid; - void *address; - -#ifndef SHM_HUGETLB -#define SHM_HUGETLB 04000 -#endif - - if ((shmid =shmget(IPC_PRIVATE, - (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1), - SHM_HUGETLB | IPC_CREAT |0600)) < 0) { - printf( "Memory allocation failed(shmget).\n"); - exit(1); - } - - address = shmat(shmid, NULL, SHM_RND); - - if ((BLASLONG)address == -1){ - printf( "Memory allocation failed(shmat).\n"); - exit(1); - } - - shmctl(shmid, IPC_RMID, 0); - - return address; -} - -#define malloc huge_malloc - -#endif - int main(int argc, char *argv[]){ FLOAT *x, *y; @@ -122,7 +50,6 @@ int main(int argc, char *argv[]){ int to = 200; int step = 1; - struct timeval start, stop; double time1,timeg; argc--;argv++; @@ -169,15 +96,15 @@ int main(int argc, char *argv[]){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; } - gettimeofday( &start, (struct timezone *)0); + begin(); #ifdef RETURN_BY_STACK DOT (&result , &m, x, &inc_x, y, &inc_y ); #else result = DOT (&m, x, &inc_x, y, &inc_y ); #endif - gettimeofday( &stop, (struct timezone *)0); + end(); - time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 
1.e-6; + time1 = getsec(); timeg += time1; diff --git a/cblas.h b/cblas.h index bf310bed2..da00d46d6 100644 --- a/cblas.h +++ b/cblas.h @@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); /* dot production of BFLOAT16 input arrays, and output as float */ float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); +void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy); #ifdef __cplusplus } diff --git a/cmake/kernel.cmake b/cmake/kernel.cmake index 7d7f5ffda..0c102bae5 100644 --- a/cmake/kernel.cmake +++ b/cmake/kernel.cmake @@ -184,8 +184,8 @@ macro(SetDefaultL2) set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c) if (BUILD_BFLOAT16) - set(SBGEMVNKERNEL ../arm/gemv_n.c) - set(SBGEMVTKERNEL ../arm/gemv_t.c) + set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c) + set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c) set(SHGERKERNEL ../generic/ger.c) endif () endmacro () diff --git a/common_interface.h b/common_interface.h index 032877fe1..b9ebb2772 100644 --- a/common_interface.h +++ b/common_interface.h @@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, xdouble *, blasint *, xdouble *, blasint *); +void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *, + bfloat16 *, blasint *, float *, float *, blasint *); void 
BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, float *, blasint *, float *, float *, blasint *); void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, diff --git a/common_level2.h b/common_level2.h index 640d4a073..9a5ebb4d9 100644 --- a/common_level2.h +++ b/common_level2.h @@ -44,6 +44,10 @@ extern "C" { #endif +int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); +int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); +int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); diff --git a/common_macro.h b/common_macro.h index 54deed57c..c6ea1bfd9 100644 --- a/common_macro.h +++ b/common_macro.h @@ -646,10 +646,12 @@ #elif defined(BFLOAT16) -#define D_TO_BF16_K SBDTOBF16_K -#define D_BF16_TO_K DBF16TOD_K -#define S_TO_BF16_K SBSTOBF16_K -#define S_BF16_TO_K SBF16TOS_K +#define D_TO_BF16_K SBDTOBF16_K +#define D_BF16_TO_K DBF16TOD_K +#define S_TO_BF16_K SBSTOBF16_K +#define S_BF16_TO_K SBF16TOS_K +#define SBGEMV_N SBGEMV_N_K +#define SBGEMV_T SBGEMV_T_K #define AMAX_K SAMAX_K #define AMIN_K SAMIN_K diff --git a/common_param.h b/common_param.h index b50e4ff80..3e3ae06f8 100644 --- a/common_param.h +++ b/common_param.h @@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, 
BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); - int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); - int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); + int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); + int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); diff --git a/common_sb.h b/common_sb.h index 66968ab00..9976e812e 100644 --- a/common_sb.h +++ b/common_sb.h @@ -8,6 +8,8 @@ #define SBDTOBF16_K sbdtobf16_k #define SBF16TOS_K sbf16tos_k #define DBF16TOD_K dbf16tod_k +#define SBGEMV_N_K sbgemv_n +#define SBGEMV_T_K sbgemv_t #define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_OTCOPY sbgemm_otcopy @@ -29,6 +31,8 @@ #define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBF16TOS_K gotoblas -> sbf16tos_k #define DBF16TOD_K gotoblas -> dbf16tod_k +#define SBGEMV_N_K gotoblas -> sbgemv_n +#define SBGEMV_T_K gotoblas -> sbgemv_t #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy diff --git a/cpuid_x86.c b/cpuid_x86.c index 728d459d1..84c12ff43 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -202,7 +202,7 @@ int support_avx(){ if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ xgetbv(0, &eax, &edx); if((eax & 6) == 6){ - ret=1; //OS support AVX + ret=1; //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2)) } } return ret; @@ -219,8 +219,8 @@ int support_avx2(){ if (!support_avx()) return 
0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //CPU supports AVX2 return ret; #else return 0; @@ -235,14 +235,14 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & 32) != 32){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & (1<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL - } + ret=1; //OS supports saving zmm registers + } return ret; #else return 0; diff --git a/driver/level2/Makefile b/driver/level2/Makefile index 7212d6662..caecf4f97 100644 --- a/driver/level2/Makefile +++ b/driver/level2/Makefile @@ -413,7 +413,13 @@ XBLASOBJS += \ xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ - xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ + xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) + +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_thread_t$(TSUFFIX).$(SUFFIX) +endif endif @@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F) +sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h + $(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F) +endif + + include ../../Makefile.tail diff --git a/driver/level2/sbgemv_thread.c b/driver/level2/sbgemv_thread.c new file mode 100644 index 000000000..534c60f95 
--- /dev/null +++ b/driver/level2/sbgemv_thread.c @@ -0,0 +1,149 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include <stdio.h> +#include <stdlib.h> +#include "common.h" + +#ifndef TRANSA +#define SBGEMV SBGEMV_N +#else +#define SBGEMV SBGEMV_T +#endif + +static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){ + + bfloat16 *a, *x; + float *y; + BLASLONG lda, incx, incy; + BLASLONG m_from, m_to, n_from, n_to; + + a = (bfloat16 *)args->a; + x = (bfloat16 *)args->b; + y = (float *)args->c; + + lda = args->lda; + incx = args->ldb; + incy = args->ldc; + +#ifndef TRANSA // N + m_from = *(range_m + 0); + m_to = *(range_m + 1); + n_from = 0; + n_to = args -> n; + a += m_from; + y += m_from * incy; +#else // T + m_from = 0; + m_to = args->m; + n_from = *(range_n + 0); + n_to = *(range_n + 1); + a += n_from * lda; + y += n_from * incy; +#endif + + SBGEMV(m_to - m_from, n_to - n_from, *((FLOAT *)(args->alpha)), a, lda, x, incx, *((FLOAT *)(args->beta)), y, incy); + + return 0; +} + +int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads) +{ + blas_arg_t args; + blas_queue_t queue[MAX_CPU_NUMBER]; + BLASLONG range[MAX_CPU_NUMBER + 1]; + +#ifndef TRANSA + BLASLONG width_for_split = m; +#else + BLASLONG width_for_split = n; +#endif + + BLASLONG BLOCK_WIDTH = width_for_split/threads; + + int mode = BLAS_BFLOAT16 | BLAS_REAL; + + args.m = m; + args.n = n; + args.a = (void *)a; + args.b = (void *)x; + args.c = (void *)y; + args.lda = lda; + args.ldb = incx; + args.ldc = incy; + args.alpha = (void *)&alpha; + args.beta = (void *)&beta; + + range[0] = 0; + + int thread_idx; + + for (thread_idx=0; thread_idxsb=sb; } } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 21d2c7948..58f4d8b59 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -330,8 +330,8 @@ int support_avx2(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); -
if((ebx & (1<<7)) != 0) - ret=1; //OS supports AVX2 + if((ebx & (1<<5)) != 0) + ret=1; //AVX2 flag is set return ret; #else return 0; @@ -346,13 +346,13 @@ int support_avx512(){ if (!support_avx()) return 0; cpuid(7, &eax, &ebx, &ecx, &edx); - if((ebx & (1<<7)) == 0){ - ret=0; //OS does not even support AVX2 + if((ebx & (1<<5)) == 0){ + ret=0; //cpu does not have avx2 flag } - if((ebx & (1u<<31)) != 0){ + if((ebx & (1<<31)) != 0){ //AVX512VL flag is set xgetbv(0, &eax, &edx); if((eax & 0xe0) == 0xe0) - ret=1; //OS supports AVX512VL + ret=1; //OS supports saving zmm register } return ret; #else diff --git a/driver/others/dynamic_arm64.c b/driver/others/dynamic_arm64.c index be22b247c..007a221db 100644 --- a/driver/others/dynamic_arm64.c +++ b/driver/others/dynamic_arm64.c @@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) { static gotoblas_t *get_coretype(void) { int implementer, variant, part, arch, revision, midr_el1; + char coremsg[128]; + +#if (!defined OS_LINUX && !defined OS_ANDROID) + return NULL; +#endif -#if (defined OS_LINUX || defined OS_ANDROID) if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { - char coremsg[128]; +#ifdef __linux + FILE *infile; + char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL; + p = (char *) NULL ; + infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r"); + if (!infile) return NULL; + fgets(buffer, sizeof(buffer), infile); + midr_el1=strtoul(buffer,NULL,16); + fclose(infile); +#else snprintf(coremsg, 128, "Kernel lacks cpuid feature support. 
Auto detection of core type failed !!!\n"); openblas_warning(1, coremsg); return NULL; - } -#else - return NULL; #endif - - get_cpu_ftr(MIDR_EL1, midr_el1); + } else { + get_cpu_ftr(MIDR_EL1, midr_el1); + } /* * MIDR_EL1 * @@ -219,6 +230,9 @@ static gotoblas_t *get_coretype(void) { return &gotoblas_FALKOR; } break; + default: + snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part); + openblas_warning(1, coremsg); } return NULL; } diff --git a/driver/others/memory.c b/driver/others/memory.c index ba2bb55b9..f0521ab2d 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -1767,11 +1767,11 @@ int get_num_procs(void); int get_num_procs(void) { static int nums = 0; + +#if defined(__GLIBC_PREREQ) cpu_set_t cpuset,*cpusetp; size_t size; int ret; - -#if defined(__GLIBC_PREREQ) #if !__GLIBC_PREREQ(2, 7) int i; #if !__GLIBC_PREREQ(2, 6) diff --git a/exports/gensymbol b/exports/gensymbol index d5ec45fad..857a17a9e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -51,7 +51,7 @@ zgeadd, dzsum); @blasobjs = (lsame, xerbla); -@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); +@bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @cblasobjsc = ( cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, @@ -94,7 +94,7 @@ @cblasobjs = ( cblas_xerbla ); -@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); +@bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @exblasobjs = ( qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, @@ -786,22 +786,22 @@ zpotri, zlamswlq, zgemlq, ); - @lapackobjs2 = (@lapackobjs2, - sladiv1, - dladiv1, + @lapackobjs2s = (@lapackobjs2s, + sladiv1); + @lapackobjs2d = (@lapackobjs2d, + dladiv1); + 
@lapackobjs = (@lapackobjs, iparam2stage, - # functions added for lapack-3.8.0 - ilaenv2stage, ); # functions added for lapack-3.9.0 @lapackobjs2c = (@lapackobjs2c, cgesvdq, - cungtsqr, - dcombssq, + cungtsqr ); @lapackobjs2d = (@lapackobjs2d, + dcombssq, dgesvdq, dorgtsqr, ); diff --git a/getarch.c b/getarch.c index 3f1448305..ab90f36d9 100644 --- a/getarch.c +++ b/getarch.c @@ -1405,8 +1405,41 @@ int main(int argc, char *argv[]){ printf("NUM_CORES=%d\n", get_num_cores()); -#if defined(__arm__) && !defined(FORCE) +#if defined(__arm__) +#if !defined(FORCE) + fprintf(stderr,"get features!\n"); get_features(); +#else + fprintf(stderr,"split archconfig!\n"); + sprintf(buffer, "%s", ARCHCONFIG); + + p = &buffer[0]; + + while (*p) { + if ((*p == '-') && (*(p + 1) == 'D')) { + p += 2; + if (*p != 'H') { + while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; } + if (*p == '-') continue; + } + while ((*p != ' ') && (*p != '\0')) { + + if (*p == '=') { + printf("="); + p ++; + while ((*p != ' ') && (*p != '\0')) { + printf("%c", *p); + p ++; + } + } else { + printf("%c", *p); + p ++; + if ((*p == ' ') || (*p =='\0')) printf("=1\n"); + } + } + } else p ++; + } +#endif #endif diff --git a/interface/Makefile b/interface/Makefile index 6b247b49f..7b0bf1792 100644 --- a/interface/Makefile +++ b/interface/Makefile @@ -48,6 +48,7 @@ SBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) SBBLAS1OBJS = sbdot.$(SUFFIX) +SBBLAS2OBJS = sbgemv.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) endif @@ -284,6 +285,7 @@ CSBLAS3OBJS = \ ifeq ($(BUILD_BFLOAT16),1) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) +CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) endif @@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS) SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS3OBJS += $(CSBLAS3OBJS) 
SBBLAS1OBJS += $(CSBBLAS1OBJS) +SBBLAS2OBJS += $(CSBBLAS2OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS) DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS2OBJS += $(CDBLAS2OBJS) @@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ) endif SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) -SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) +SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) @@ -507,7 +510,7 @@ ifneq ($(BUILD_COMPLEX16),1) endif FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) -$(info FUNCOBJS = {[$(FUNCOBJS)]} ) + ifdef EXPRECISION FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) endif @@ -538,7 +541,7 @@ clean :: level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ -level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) +level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) @@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) $(CFLAGS) -c $< -o $(@F) +endif + ifndef USE_NETLIB_GEMV sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c $(CC) -c $(CFLAGS) -o $(@F) $< @@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) +ifeq ($(BUILD_BFLOAT16),1) +cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c + $(CC) -DCBLAS -c $(CFLAGS) $< -o 
$(@F) +endif + cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< diff --git a/interface/gemv.c b/interface/gemv.c index c9d52cd69..d5d739fb1 100644 --- a/interface/gemv.c +++ b/interface/gemv.c @@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order, } #endif - //printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta); if ((m==0) || (n==0)) return; lenx = n; diff --git a/interface/sbgemv.c b/interface/sbgemv.c new file mode 100644 index 000000000..89debe82d --- /dev/null +++ b/interface/sbgemv.c @@ -0,0 +1,210 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#include <stdio.h> +#include "common.h" +#include "l1param.h" +#ifdef FUNCTION_PROFILE +#include "functable.h" +#endif + +#define ERROR_NAME "SBGEMV " + +#ifdef SMP +static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = { + sbgemv_thread_n, sbgemv_thread_t, +}; +#endif + +#ifndef CBLAS + +void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY) +{ + char trans = *TRANS; + blasint m = *M; + blasint n = *N; + blasint lda = *LDA; + blasint incx = *INCX; + blasint incy = *INCY; + float alpha = *ALPHA; + float beta = *BETA; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + blasint info; + blasint lenx, leny; + blasint i; + + PRINT_DEBUG_NAME; + + TOUPPER(trans); + + info = 0; + + i = -1; + + if (trans == 'N') {i = 0;} + if (trans == 'T') {i = 1;} + if
(trans == 'R') {i = 0;} + if (trans == 'C') {i = 1;} + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (i < 0) {info = 1;} + + trans = i; + + if (info != 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#else + +void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy) +{ + blasint lenx, leny; + int trans; + blasint info, t; +#ifdef SMP + int nthreads; +#endif + + int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = { + SBGEMV_N, SBGEMV_T, + }; + + PRINT_DEBUG_CNAME; + + trans = -1; + info = 0; + + if (order == CblasColMajor) { // Column Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 0; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 1; + } + } else { // Row Major + if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) { + trans = 1; + } else if (TransA == CblasTrans || TransA == CblasConjTrans) { + trans = 0; + } + + t = n; + n = m; + m = t; + } + + info = -1; + + if (incy == 0) {info = 11;} + if (incx == 0) {info = 8;} + if (lda < MAX(1, m)) {info = 6;} + if (n < 0) {info = 3;} + if (m < 0) {info = 2;} + if (trans < 0) {info = 1;} + + if (info >= 0) { + BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME)); + return; + } + +#endif + + if ((m==0) || (n==0)) return; + + if (trans) { + lenx = m; + leny = n; + } else { + lenx = n; + leny = m; + } + + if (alpha == ZERO) { + if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0); + return; + } + + IDEBUG_START; + FUNCTION_PROFILE_START(); + + if (incx < 0) {x -= (lenx - 1) * incx;} + if (incy < 0) {y -= (leny - 1) * incy;} + +#ifdef SMP + int thread_thres_row = 20480; + if (trans) { + if (n <= thread_thres_row) { 
+ nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } else { + if (m <= thread_thres_row) { + nthreads = 1; + } else { + nthreads = num_cpu_avail(1); + } + } + + + if (nthreads == 1) { +#endif + (sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy); +#ifdef SMP + } else { + (sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads); + } +#endif + + FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n); + IDEBUG_END; + + return; +} diff --git a/kernel/Makefile b/kernel/Makefile index e52781c6d..e811ed43d 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -12,11 +12,6 @@ ifdef HAVE_SSSE3 CFLAGS += -mssse3 endif -ifeq ($(C_COMPILER), GCC) -GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9) -GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10) -endif - ifeq ($(ARCH), power) ifeq ($(C_COMPILER), CLANG) override CFLAGS += -fno-integrated-as @@ -26,20 +21,14 @@ endif AVX2OPT = ifeq ($(C_COMPILER), GCC) # AVX2 support was added in 4.7.0 -GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) -GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) -GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7) -GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) +GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif endif ifeq ($(C_COMPILER), CLANG) # Any clang posing as gcc 4.2 should be new enough (3.4 or later) - GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) - GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5) - GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2) - GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7) + GCCVERSIONCHECK := 
$(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) AVX2OPT = -mavx2 endif diff --git a/kernel/Makefile.L2 b/kernel/Makefile.L2 index 79399c342..888a9b959 100644 --- a/kernel/Makefile.L2 +++ b/kernel/Makefile.L2 @@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL XGEMVTKERNEL = zgemv_t.S endif +ifeq ($(BUILD_BFLOAT16),1) +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = ../x86_64/sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = ../x86_64/sbgemv_t.c +endif +endif + ### GER ### ifndef SGERKERNEL @@ -234,6 +244,12 @@ XBLASOBJS += \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) +ifeq ($(BUILD_BFLOAT16),1) +SBBLASOBJS += \ + sbgemv_n$(TSUFFIX).$(SUFFIX) \ + sbgemv_t$(TSUFFIX).$(SUFFIX) +endif + ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ @@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ +ifeq ($(BUILD_BFLOAT16),1) +$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL) + $(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@ +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 86df7e3a2..b4c7a5e41 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -141,13 
+141,9 @@ DASUMKERNEL = dasum.c CASUMKERNEL = casum.c ZASUMKERNEL = zasum.c # -SAXPYKERNEL = saxpy.c +SAXPYKERNEL = saxpy_power10.c DAXPYKERNEL = daxpy_power10.c -ifneq ($(GCCVERSIONGTEQ9),1) -CAXPYKERNEL = caxpy_power9.S -else -CAXPYKERNEL = caxpy.c -endif +CAXPYKERNEL = caxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c # SCOPYKERNEL = scopy_power10.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c new file mode 100644 index 000000000..0d13416b3 --- /dev/null +++ b/kernel/power/caxpy_microk_power10.c @@ -0,0 +1,188 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define HAVE_KERNEL_8 1 +static void caxpy_kernel_8 (long n, float *x, float *y, + float alpha_r, float alpha_i) +{ +#if !defined(CONJ) + static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 }; +#else + static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 }; +#endif + const float *mvecp = mvec; + /* We have to load reverse mask for big endian. */ + /* __vector unsigned char mask={ 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11}; */ + + __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4}; + long ytmp; + + __asm__ + ( + "xscvdpspn 32, %7 \n\t" + "xscvdpspn 33, %8 \n\t" + "xxspltw 32, 32, 0 \n\t" + "xxspltw 33, 33, 0 \n\t" + "lxvd2x 36, 0, %9 \n\t" // mvec + +#if !defined(CONJ) + "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec +#else + "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec +#endif + "mr %4, %3 \n\t" + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 40, 0(%2) \n\t" // x0 + "lxvp 42, 32(%2) \n\t" // x2 + "lxvp 48, 0(%3) \n\t" // y0 + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + + "lxvp 44, 64(%2) \n\t" // x4 + "lxvp 46, 96(%2) \n\t" // x6 + "lxvp 34, 64(%3) \n\t" // y4 + "lxvp 38, 96(%3) \n\t" // y6 + + "xxperm 56, 44, %x10 \n\t" // exchange 
real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + + "addi %2, %2, 128 \n\t" + "addi %3, %3, 128 \n\t" + + "addic. %1, %1, -16 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "lxvp 40, 0(%2) \n\t" // x0 + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + "lxvp 42, 32(%2) \n\t" // x2 + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "lxvp 44, 64(%2) \n\t" // x4 + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + "lxvp 46, 96(%2) \n\t" // x6 + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "addi %2, %2, 128 \n\t" + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "addi %4, %4, 128 \n\t" + "xxperm 52, 40, %x10 \n\t" // exchange real and imag part + "xxperm 53, 41, %x10 \n\t" // exchange real and imag part + + "lxvp 48, 0(%3) \n\t" // y0 + "xxperm 54, 42, %x10 \n\t" // exchange real and imag part + "xxperm 55, 43, %x10 \n\t" // exchange real and imag part + "lxvp 50, 32(%3) \n\t" // y2 + + "xxperm 56, 44, %x10 \n\t" // exchange real and imag part + "xxperm 57, 45, %x10 \n\t" // exchange real and imag part + "lxvp 34, 64(%3) \n\t" // y4 + "xxperm 58, 46, %x10 \n\t" // exchange real and imag part + "xxperm 59, 47, %x10 \n\t" // exchange real and imag part + "lxvp 38, 96(%3) \n\t" // y6 + + "addi %3, %3, 128 \n\t" + + "addic. 
%1, %1, -16 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i + "xvmaddasp 49, 41, 32 \n\t" + "xvmaddasp 50, 42, 32 \n\t" + "xvmaddasp 51, 43, 32 \n\t" + + "xvmaddasp 34, 44, 32 \n\t" + "xvmaddasp 35, 45, 32 \n\t" + "xvmaddasp 38, 46, 32 \n\t" + "xvmaddasp 39, 47, 32 \n\t" + + "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r + "xvmaddasp 49, 53, 33 \n\t" + "xvmaddasp 50, 54, 33 \n\t" + "xvmaddasp 51, 55, 33 \n\t" + + "xvmaddasp 34, 56, 33 \n\t" + "xvmaddasp 35, 57, 33 \n\t" + "xvmaddasp 38, 58, 33 \n\t" + "xvmaddasp 39, 59, 33 \n\t" + + "stxvp 48, 0(%4) \n\t" + "stxvp 50, 32(%4) \n\t" + "stxvp 34, 64(%4) \n\t" + "stxvp 38, 96(%4) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y), // 3 + "=b" (ytmp) // 4 + : + "m" (*x), + "m" (*mvecp), + "d" (alpha_r), // 7 + "d" (alpha_i), // 8 + "4" (mvecp), // 9 + "wa" (mask) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59" + ); +} diff --git a/kernel/power/caxpy_power10.c b/kernel/power/caxpy_power10.c new file mode 100644 index 000000000..14b8cda67 --- /dev/null +++ b/kernel/power/caxpy_power10.c @@ -0,0 +1,126 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#include "common.h" + + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "caxpy_microk_power10.c" +#endif + + +#ifndef HAVE_KERNEL_8 +/* Plain-C fallback, compiled only when the include above did not define HAVE_KERNEL_8. n counts complex elements; each loop pass updates two of them (four FLOATs). The caller below only passes multiples of 16. */ +static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i) +{ + BLASLONG register i = 0; + BLASLONG register ix = 0; + + + + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] - da_i * x[ix+3] ) ; + y[ix+3] += ( da_r * x[ix+3] + da_i * x[ix+2] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; + y[ix+2] += ( da_r * x[ix+2] + da_i * x[ix+3] ) ; + y[ix+3] -= ( da_r * x[ix+3] - da_i * x[ix+2] ) ; +#endif + + ix+=4 ; + i+=2 ; + + } + +} + +#endif +/* Complex AXPY: y += (da_r + i*da_i) * x over n interleaved (real, imag) elements; when CONJ is defined x is conjugated before the multiply. inc_x / inc_y are strides in complex elements. The dummy* arguments are not read. Always returns 0. */ +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + /* contiguous case: n1 is n rounded down to a multiple of 16; those elements go to caxpy_kernel_8, the remainder to the scalar loop */ + BLASLONG n1 = n & -16; + + if ( n1 ) + { + caxpy_kernel_8 (n1, x, y, da_r, da_i); + ix = 2 * n1; + } + i = n1; + while(i < n) + { +#if !defined(CONJ) + y[ix] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[ix+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[ix] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[ix+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + i++ ; + ix += 2; + + } + return(0); + + + } + /* strided case: strides are doubled because each complex element occupies two FLOATs */ + inc_x *=2; + inc_y *=2; + + while(i < n) + { + +#if !defined(CONJ) + y[iy] += ( da_r * x[ix] - da_i * x[ix+1] ) ; + y[iy+1] += ( da_r * x[ix+1] + da_i * x[ix] ) ; +#else + y[iy] += ( da_r * x[ix] + da_i * x[ix+1] ) ; + y[iy+1] -= ( da_r * x[ix+1] - da_i * x[ix] ) ; +#endif + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/power/saxpy_microk_power10.c b/kernel/power/saxpy_microk_power10.c new file mode 
100644 index 000000000..6ede1dcdd --- /dev/null +++ b/kernel/power/saxpy_microk_power10.c @@ -0,0 +1,181 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + +#define HAVE_KERNEL_8 1 + +static void saxpy_kernel_64(long n, float *x, float *y, float alpha) +{ + __vector float t0 = {alpha, alpha,alpha, alpha}; + + __asm__ + ( + + "dcbt 0, %2 \n\t" + "dcbt 0, %3 \n\t" + + "lxvp 32, 0(%2) \n\t" + "lxvp 34, 32(%2) \n\t" + "lxvp 40, 64(%2) \n\t" + "lxvp 42, 96(%2) \n\t" + "lxvp 48, 128(%2) \n\t" + "lxvp 50, 160(%2) \n\t" + "lxvp 52, 192(%2) \n\t" + "lxvp 54, 224(%2) \n\t" + + "lxvp 36, 0(%3) \n\t" + "lxvp 38, 32(%3) \n\t" + "lxvp 44, 64(%3) \n\t" + "lxvp 46, 96(%3) \n\t" + "lxvp 56, 128(%3) \n\t" + "lxvp 58, 160(%3) \n\t" + "lxvp 60, 192(%3) \n\t" + "lxvp 62, 224(%3) \n\t" + + "addi %2, %2, 256 \n\t" + + "addic. %1, %1, -64 \n\t" + "ble two%= \n\t" + + ".align 5 \n" + "one%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + + "lxvp 32, 0(%2) \n\t" + "stxvp 36, 0(%3) \n\t" + + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "lxvp 34, 32(%2) \n\t" + "stxvp 38, 32(%3) \n\t" + + "lxvp 36, 256(%3) \n\t" + "lxvp 38, 288(%3) \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + + "lxvp 40, 64(%2) \n\t" + "stxvp 44, 64(%3) \n\t" + + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "lxvp 42, 96(%2) \n\t" + "stxvp 46, 96(%3) \n\t" + + "lxvp 44, 320(%3) \n\t" + "lxvp 46, 352(%3) \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + + "lxvp 48, 128(%2) \n\t" + "stxvp 56, 128(%3) \n\t" + + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "lxvp 50, 160(%2) \n\t" + "stxvp 58, 160(%3) \n\t" + + "lxvp 56, 384(%3) \n\t" + "lxvp 58, 416(%3) \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + + "lxvp 52, 192(%2) \n\t" + "stxvp 60, 192(%3) \n\t" + + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + + "lxvp 54, 224(%2) \n\t" + "stxvp 62, 224(%3) \n\t" + + "lxvp 60, 448(%3) \n\t" + "lxvp 62, 480(%3) \n\t" + + "addi %2, %2, 256 \n\t" + 
"addi %3, %3, 256 \n\t" + + "addic. %1, %1, -64 \n\t" + "bgt one%= \n" + + "two%=: \n\t" + + "xvmaddasp 36, 32, %x4 \n\t" + "xvmaddasp 37, 33, %x4 \n\t" + "xvmaddasp 38, 34, %x4 \n\t" + "xvmaddasp 39, 35, %x4 \n\t" + + "xvmaddasp 44, 40, %x4 \n\t" + "xvmaddasp 45, 41, %x4 \n\t" + "xvmaddasp 46, 42, %x4 \n\t" + "xvmaddasp 47, 43, %x4 \n\t" + + "xvmaddasp 56, 48, %x4 \n\t" + "xvmaddasp 57, 49, %x4 \n\t" + "xvmaddasp 58, 50, %x4 \n\t" + "xvmaddasp 59, 51, %x4 \n\t" + + "xvmaddasp 60, 52, %x4 \n\t" + "xvmaddasp 61, 53, %x4 \n\t" + "xvmaddasp 62, 54, %x4 \n\t" + "xvmaddasp 63, 55, %x4 \n\t" + "stxvp 36, 0(%3) \n\t" + "stxvp 38, 32(%3) \n\t" + "stxvp 44, 64(%3) \n\t" + "stxvp 46, 96(%3) \n\t" + "stxvp 56, 128(%3) \n\t" + "stxvp 58, 160(%3) \n\t" + "stxvp 60, 192(%3) \n\t" + "stxvp 62, 224(%3) \n\t" + + "#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n" + : + "+m" (*y), + "+r" (n), // 1 + "+b" (x), // 2 + "+b" (y) // 3 + : + "wa" (t0), // 4 + "m" (*x) + : + "cr0", + "vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39", + "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47", + "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55", + "vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63" + ); + +} + + diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c new file mode 100644 index 000000000..8c7c22390 --- /dev/null +++ b/kernel/power/saxpy_power10.c @@ -0,0 +1,119 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ + + +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) +#include "saxpy_microk_power10.c" +#endif +/* Plain-C fallback below is compiled only when the include above did not define HAVE_KERNEL_8. It does y[i] += alpha * x[i], eight elements per loop pass; the caller only passes multiples of 64. */ +#ifndef HAVE_KERNEL_8 +static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha) +{ + BLASLONG register i = 0; + + while(i < n) + { + y[i] += alpha * x[i]; + y[i+1] += alpha * x[i+1]; + y[i+2] += alpha * x[i+2]; + y[i+3] += alpha * x[i+3]; + y[i+4] += alpha * x[i+4]; + y[i+5] += alpha * x[i+5]; + y[i+6] += alpha * x[i+6]; + y[i+7] += alpha * x[i+7]; + i+=8 ; + + } + +} +#endif +/* AXPY: y += da * x over n FLOAT elements with element strides inc_x / inc_y. The dummy* arguments are not read. Always returns 0. */ +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + + if ( n <= 0 ) return(0); + + if ( (inc_x == 1) && (inc_y == 1) ) + { + /* contiguous case: n1 is n rounded down to a multiple of 64; those elements go to saxpy_kernel_64, the remainder to the scalar loop */ + BLASLONG n1 = n & -64; + + if ( n1 ) + saxpy_kernel_64(n1, x, y, da); + + i = n1; + while(i < n) + { + + y[i] += da * x[i] ; + i++ ; + + } + return(0); + + + } + /* strided case: a loop unrolled by four over the first n & -4 elements, then a one-at-a-time tail */ + BLASLONG n1 = n & -4; + + while(i < n1) + { + + FLOAT m1 = da * x[ix] ; + FLOAT m2 = da * x[ix+inc_x] ; + FLOAT m3 = da * x[ix+2*inc_x] ; + FLOAT m4 = da * x[ix+3*inc_x] ; + + y[iy] += m1 ; + y[iy+inc_y] += m2 ; + y[iy+2*inc_y] += m3 ; + y[iy+3*inc_y] += m4 ; + + ix += inc_x*4 ; + iy += inc_y*4 ; + i+=4 ; + + } + + while(i < n) + { + + y[iy] += da * x[ix] ; + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 849a4194a..d0317a745 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -69,7 +69,7 @@ gotoblas_t TABLE_NAME = { snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, dsdot_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, - sgemv_nTS, sgemv_tTS, sger_kTS, + sbgemv_nTS, sbgemv_tTS, sger_kTS, ssymv_LTS, ssymv_UTS, sbgemm_kernelTS, sbgemm_betaTS, diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL index 855e1ff8c..b92f480e9 100644 --- a/kernel/x86_64/KERNEL 
+++ b/kernel/x86_64/KERNEL @@ -384,6 +384,14 @@ endif GEMVDEP = ../l2param.h +ifndef SBGEMVNKERNEL +SBGEMVNKERNEL = sbgemv_n.c +endif + +ifndef SBGEMVTKERNEL +SBGEMVTKERNEL = sbgemv_t.c +endif + ifndef SGEMVNKERNEL SGEMVNKERNEL = sgemv_n.c endif diff --git a/kernel/x86_64/bf16_common_macros.h b/kernel/x86_64/bf16_common_macros.h new file mode 100644 index 000000000..1014ecc4d --- /dev/null +++ b/kernel/x86_64/bf16_common_macros.h @@ -0,0 +1,795 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#ifndef __BF16_COMMON_MACROS +#define __BF16_COMMON_MACROS + +#include <immintrin.h> /* declares the _mm_*, _mm256_* and _mm512_* intrinsics used by the macros below */ + +#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \ + reg256##_0 = _mm512_castps512_ps256(reg512##_0); \ + reg256##_1 = _mm512_castps512_ps256(reg512##_1); + + +#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm256_loadu_si256(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_loadu_si256(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_loadu_si256(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_loadu_si256(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_loadu_si256(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_loadu_si256(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm256_loadu_si256(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = 
_mm256_loadu_si256(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \ + regArray##_0 = _mm_loadu_si128(&a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_loadu_si128(&a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_loadu_si128(&a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_loadu_si128(&a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_loadu_si128(&a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_loadu_si128(&a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_loadu_si128(&a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_loadu_si128(&a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \ + regArray = _mm512_loadu_si512(&a[idx_m*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + 
regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \ + regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \ + regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \ + regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \ + regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + 
idx_n]); \ + regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \ + regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \ + regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \ + regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]); + + +#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \ + regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \ + regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \ + regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \ + regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); + +#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \ + regArray = _mm512_maskz_loadu_epi16(mask, &a[idx_m*lda + idx_n]); + +#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \ + reg = _mm512_loadu_si512(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \ + reg = _mm256_loadu_si256(x + idx_n); + + +#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \ + reg = _mm_loadu_si128(x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \ + reg = _mm512_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \ + reg = _mm256_maskz_loadu_epi16(mask, x + idx_n); + + +#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \ + reg = _mm_maskz_loadu_epi16(mask, x + idx_n); + + +/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row + Input - register array of 8 rows of row-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 
|e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 + |e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31 +*/ +#define BF16_INTERLEAVE_8x32(regArray) \ + regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = 
_mm512_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15); + + +/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11 + |g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + |e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15 + |g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 + |e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 + 
|e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13 + |e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15 +*/ +#define BF16_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \ + regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \ + regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \ + regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \ + regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15); + +/* 2-step interleave for matrix against 4 rows with 32 BF16 elements per row + Input - register array of 4 rows of row-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31 + 
|c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31 +*/ +#define BF16_INTERLEAVE_4x32(regArray) \ + regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for matrix against 4 rows with 16 BF16 elements per row + Input - register array of 4 rows of row-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11 + |c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11 + |a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15 + |c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15 + + Step 2: 4-element interleave for matrix + |a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 + |a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11 + |a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13 + |a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15 +*/ 
+#define BF16_INTERLEAVE_4x16(regArray) \ + regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \ + regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \ + regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \ + regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7); + + +/* 2-step interleave for x with 32 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31 +*/ +#define BF16_INTERLEAVE_1x32(regArray) \ + regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, 
regArray##_3); + + +/* 2-step interleave for x with 16 BF16 elements + Input - original vector + Output - the output of Step 2 + + Step 1: 2-element interleave for x: + |x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11 + |x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15 + + Step 2: 4-element interleave for x: + |x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 + |x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11 + |x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13 + |x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15 +*/ +#define BF16_INTERLEAVE_1x16(regArray) \ + regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \ + regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \ + \ + regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \ + regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \ + regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \ + regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3); + +/* 1-step interleave to exchange the high 256 bits and low 256 bits of 4 pairs of registers + |a0|a1|...|a14|a15|i0|i1|...|i14|i15| + |b0|b1|...|b14|b15|j0|j1|...|j14|j15| + |c0|c1|...|c14|c15|k0|k1|...|k14|k15| + |d0|d1|...|d14|d15|l0|l1|...|l14|l15| + |e0|e1|...|e14|e15|m0|m1|...|m14|m15| + |f0|f1|...|f14|f15|n0|n1|...|n14|n15| + |g0|g1|...|g14|g15|o0|o1|...|o14|o15| + |h0|h1|...|h14|h15|p0|p1|...|p14|p15| +*/ +#define BF16_INTERLEAVE256_8x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \ + regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \ + regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \ + regArray##_6 = 
_mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \ + regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee); + + +/* 1-step interleave to exchange the high 256 bits and low 256 bits of 2 pairs of registers + |a0|a1|...|a14|a15|e0|e1|...|e14|e15| + |b0|b1|...|b14|b15|f0|f1|...|f14|f15| + |c0|c1|...|c14|c15|g0|g1|...|g14|g15| + |d0|d1|...|d14|d15|h0|h1|...|h14|h15| +*/ +#define BF16_INTERLEAVE256_4x32(regArray) \ + regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \ + regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \ + regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \ + regArray##_3 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee); + + +#define BF16_PERMUTE_8x32(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7); + + +#define BF16_PERMUTE_8x32_2(idx, regArray) \ + regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \ + regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \ + regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \ + regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \ + regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7); + + +#define BF16_PERMUTE_4x32(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \ + regArray##_5 = 
_mm512_permutexvar_epi16(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3); + + +#define BF16_PERMUTE_4x32_2(idx, regArray) \ + regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \ + regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \ + regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \ + regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \ + accumArray##_0 = 
_mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3); + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3); + + +/* Calculate the dot result for 2-step interleaved matrix and vector + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3); + + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable 
per platform) +*/ +#define BF16_DOT_8x32(accumArray, matArray, xArray) \ + accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \ + accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \ + accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \ + accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \ + accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \ + accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \ + accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \ + accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 32 elements per row + (Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_1x32(accumArray, matArray, xArray) \ + accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray); + +/* Calculate the dot result for matrix and vector at 16 elements per row + (Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform) +*/ +#define BF16_DOT_8x16(accumArray, matArray, xArray) \ + accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \ + accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \ + accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \ + accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \ + accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \ + accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \ + accumArray##_6 = 
_mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \ + accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray); + + +/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13| + |c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13| + |e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13| + |g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13| + |a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15| + |c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15| + |e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15| + |g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12| + |a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13| + |e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12| + |e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13| + |a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14| + |a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15| + |e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14| + |e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15| +*/ +#define FP32_INTERLEAVE_8x16(regArray) \ + regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + 
regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \ + regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \ + regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \ + regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \ + regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15); + +#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \ + regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \ + regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \ + regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \ + regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \ + regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \ + regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \ + regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \ + regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \ + \ + regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \ + regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \ + regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \ + regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) 
regArray[15]); \ + regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]); + +/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row + Input - register array of 8 rows of raw-major matrix + Output - the output of Step 2 + + Step 1: 2-element interleave for matrix + |a0|b0|a1|b1|a4|b4|a5|b5| + |c0|d0|c1|d1|c4|d4|c5|d5| + |e0|f0|e1|f1|e4|f4|e5|f5| + |g0|h0|g1|h1|g4|h4|g5|h5| + |a2|b2|a3|b3|a6|b6|a7|b7| + |c2|d2|c3|d3|c6|d6|c7|d7| + |e2|f2|e3|f3|e6|f6|e7|f7| + |g2|h2|g3|h3|g6|h6|g7|h7| + + Step 2: 4-element interleave for matrix + |a0|b0|c0|d0|a4|b4|c4|d4| + |a1|b1|c1|d1|a5|b5|c5|d5| + |e0|f0|g0|h0|e4|f4|g4|h4| + |e1|f1|g1|h1|e5|f5|g5|h5| + |a2|b2|c2|d2|a6|b6|c6|d6| + |a3|b3|c3|d3|a7|b7|c7|d7| + |e2|f2|g2|h2|e6|f6|g6|h6| + |e3|f3|g3|h3|e7|f7|g7|h7| +*/ +#define FP32_INTERLEAVE_8x8(regArray) \ + regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \ + regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \ + regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \ + regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \ + regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \ + regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \ + regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \ + regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \ + \ + regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \ + regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \ + regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \ + 
regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \ + regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15); + + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x16(regArray) \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6); + +#define FP32_ACCUM2_8x16_ARRAY(regArray) \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \ + regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \ + regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \ + regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \ + regArray[4] = _mm512_add_ps(regArray[4], regArray[6]); + +/* Accumulate the result for 2 batch of 4-registers +*/ +#define FP32_ACCUM2_8x8(regArray) \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \ + regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \ + regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \ + regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \ + regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6); + + +/* Store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + beta * y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, 
_mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + beta * y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + beta * y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result + y) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 (alpha * result + y) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \ + _mm512_mask_storeu_ps(targetAddr, mask, 
regResult); + + +/* Store 8 (alpha * result + y) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 (alpha * result + y) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 (alpha * result + y) to y +*/ +#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 (alpha * result + y) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \ + regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 16 (alpha * result) to y +*/ +#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Masked store 16 (alpha * result) to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult)); + + +/* Store 8 (alpha * result) to y +*/ +#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Masked store 8 (alpha * result) to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult)); + + +/* Store 4 (alpha * result) to y +*/ 
+#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Masked store 4 (alpha * result) to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult)); + + +/* Store 16 result to y +*/ +#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm512_storeu_ps(targetAddr, regResult); + + +/* Masked store 16 result to y +*/ +#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm512_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 8 result to y +*/ +#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm256_storeu_ps(targetAddr, regResult); + + +/* Masked store 8 result to y +*/ +#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm256_mask_storeu_ps(targetAddr, mask, regResult); + + +/* Store 4 result to y +*/ +#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \ + _mm_storeu_ps(targetAddr, regResult); + + +/* Masked store 4 result to y +*/ +#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \ + _mm_mask_storeu_ps(targetAddr, mask, regResult); + +#endif diff --git a/kernel/x86_64/sbgemv_n.c b/kernel/x86_64/sbgemv_n.c new file mode 100644 index 000000000..18e64dc3f --- /dev/null +++ b/kernel/x86_64/sbgemv_n.c @@ -0,0 +1,137 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_n_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? 
(TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_N_ACCL_KERNEL +static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_m; + float accum = 0.0; + float tmp_x = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG j=0; j= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_N_ACCL_KERNEL 1 +#include "common.h" +#include + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_n_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_n_microk_cooperlake_template.c" + +static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y); + } else { // ALPHA != 1.0, need to multipy ALPHA + sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + 
sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c new file mode 100644 index 000000000..46e6d0ff9 --- /dev/null +++ b/kernel/x86_64/sbgemv_n_microk_cooperlake_template.c @@ -0,0 +1,234 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ +#include +#include "common.h" + +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define 
STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + + +// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + BLASLONG tag_m_128x = m & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3; + __m512i matrixArray_0, matrixArray_1, 
matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + __m512i ZERO512 = _mm512_setzero_si512(); + + unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa); + __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value); + unsigned int blend_lo_mask_value = ((unsigned int)0x55555555); + __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value); + + __m512i M512_EPI32_8 = _mm512_set1_epi32(8); + __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8); + + for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1); + matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1); + matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2); + matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2); + matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3); + matrixArray_7 = 
_mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0) + BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0) + BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0) + BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0) + BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0) + BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3); + accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3); + accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5); + accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5); + accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7); + accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32) + STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48) + STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64) + STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80) + STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96) + STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112) + } + + for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + 
BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0) + STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16) + } + + if (tag_m_32x != m) { + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + unsigned short store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15))); + __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value); + + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < n; idx_n++) { + xArray_0 = _mm512_set1_epi16(x[idx_n]); + + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask) + + matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0); + matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0); + + BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0) + } + accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + + if ((m-tag_m_32x) > 16) { + STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0) + STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask) + } else { + STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask) + } + } + + return 0; +} diff --git a/kernel/x86_64/sbgemv_t.c b/kernel/x86_64/sbgemv_t.c new file mode 100644 index 000000000..22b099116 --- /dev/null +++ b/kernel/x86_64/sbgemv_t.c @@ -0,0 +1,142 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. 
+Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +#if defined (COOPERLAKE) +#include "sbgemv_t_microk_cooperlake.c" +#endif + +#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \ + ptr = (TYPE *) malloc(sizeof(TYPE)*alloc_size + 63); \ + ptr_align = ((int)(((uintptr_t)ptr & (uintptr_t)0x3F))!=0) ? 
(TYPE *)((char *)ptr + (64 - (int)((uintptr_t)ptr & (uintptr_t)0x3F))) : ptr + +#define ALIGN64_FREE(ptr) \ + free(ptr) + +#ifndef HAVE_SBGEMV_T_ACCL_KERNEL +static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + BLASLONG offset_lda, offset_n; + float accum = 0.0; + + bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n); + float * a_fp32 = malloc(sizeof(float)*m*n); + float * x_fp32 = malloc(sizeof(float)*n); + + for (BLASLONG i=0; i= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9)) + +#define HAVE_SBGEMV_T_ACCL_KERNEL 1 + +// Define micro kernels for ALPHA not ONE && BETA effective && BETA not ONE scenarios +#undef ZERO_BETA +#undef ONE_BETA +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA as ONE scenarios +#undef ZERO_BETA +#define ONE_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA not ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#undef ONE_ALPHA +#include "sbgemv_t_microk_cooperlake_template.c" + +// Define micro kernels for ALPHA as ONE && BETA in-effective (BETA == 0) scenarios +#define ZERO_BETA 1 +#define ONE_ALPHA 1 +#include "sbgemv_t_microk_cooperlake_template.c" + +static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +{ + if (beta == ZERO) { // BETA == 0.0, no need to accumulate the original Y data + if (alpha == ONE) { // ALPHA == 1.0, no need to multipy ALPHA + if (n > 127) { + sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1 (m, alpha, a, x, y); break; + case 2: 
sbgemv_kernel_32x2 (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3 (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4 (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5 (m, alpha, a, x, y); break; + case 6: sbgemv_kernel_16x6 (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7 (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8 (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9 (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break; + case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y); + } + } + } + } else { // ALPHA != 1.0, need to multipy ALPHA + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break; + case 2: sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break; + case 3: sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break; + case 4: sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break; + case 5: sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break; + case 6: sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break; + case 7: sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break; + case 8: sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break; + case 9: sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break; + case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break; + case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break; 
+ case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break; + case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break; + case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break; + case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break; + case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y); + } + } + } + } + } else { // BETA != 0.0, need to accumulate the original Y data no matter what ALPHA is + if (beta == ONE) { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break; + case 16: 
sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y); + } + } + } + } else { + if (n > 127) { + sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else if (n > 32) { + sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (n > 16) { + sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } else { + if (lda == n) { + switch(n) { + case 1: sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break; + case 2: sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break; + case 3: sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break; + case 4: sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break; + case 5: sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break; + case 6: sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break; + case 7: sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break; + case 8: sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break; + case 9: sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break; + case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break; + case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break; + case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break; + case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break; + case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break; + case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break; + case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break; + default: break; + } + } else { + sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y); + } + } + } + } + } + + return 0; +} + +#endif diff --git a/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c new file mode 
100644 index 000000000..51e681add --- /dev/null +++ b/kernel/x86_64/sbgemv_t_microk_cooperlake_template.c @@ -0,0 +1,3082 @@ +/*************************************************************************** +Copyright (c) 2014, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*****************************************************************************/ +#include +#include "common.h" +// Include common macros for BF16 based operations with IA intrinsics +#include "bf16_common_macros.h" + +#ifndef ZERO_BETA // Beta is non-zero + +#ifndef ONE_BETA // BETA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA + +#else // BETA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE + +#endif + +#else // BETA is zero + +#ifndef ONE_ALPHA // ALPHA is not ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA +#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA + +#else // ALPHA is ONE + +#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT +#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT +#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT +#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT +#define STORE4_COMPLETE_RESULT 
STORE4_COMPLETE_RESULT_DIRECT +#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT + +#endif + +#endif + + +// 32 rows parallel processing BF16 GEMV kernel for n=1 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x1_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x1_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x1_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x1(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i xArray; + __m512 result_0, result_1; +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA +#ifndef ONE_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif +#endif + + __m512i load_idx_lo = _mm512_set_epi16(0, 15, 0, 14, 0, 13, 0, 12, 0, 11, 0, 10, 0, 9, 0, 8,\ + 0, 7, 0, 6, 0, 5, 0, 4, 0, 3, 0, 2, 0, 1, 0, 0); + __m512i M512_EPI16_16 = _mm512_set1_epi16(16); + __m512i load_idx_hi = _mm512_add_epi16(load_idx_lo, M512_EPI16_16); + + unsigned int interleve_mask_value = ((unsigned int) 0x55555555); + __mmask32 interleave_mask = *((__mmask32*) &interleve_mask_value); + + xArray = _mm512_set1_epi16((short) x[0]); + xArray = _mm512_mask_blend_epi16(interleave_mask, _mm512_setzero_si512(), xArray); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the 
high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-tail_num)); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 32 rows with n=1 + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_lo, matrixArray_0); // Expand the low 16 elements + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_hi, matrixArray_0); // Expand the high 16 elements + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_1, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_2, (__m512bh) xArray); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> (32-tail_num)); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + STORE16_MASK_COMPLETE_RESULT(result_1, y+tag_m_32x+16, store_mask) + } else if (tail_num > 8) { + __m256 result256_0 = _mm256_setzero_ps(); + __m256 result256_1 = _mm256_setzero_ps(); + + __m256i load_idx_lo256 = _mm512_castsi512_si256(load_idx_lo); + __m256i load_idx_hi256 = _mm512_extracti32x8_epi32(load_idx_lo, 0x1); + __m256i xArray256 = _mm512_castsi512_si256(xArray); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + __m256i matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 16 rows with n=1 + __m256i matrixArray256_1 = _mm256_permutexvar_epi16(load_idx_lo256, matrixArray256_0); // Expand the low 8 elements + __m256i 
matrixArray256_2 = _mm256_permutexvar_epi16(load_idx_hi256, matrixArray256_0); // Expand the high 8 elements + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_1, (__m256bh) xArray256); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_2, (__m256bh) xArray256); + + unsigned char store_mask_value = (((unsigned char)0xff) >> (16-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_MASK_COMPLETE_RESULT(result256_1, y+tag_m_32x+8, store_mask) + } else { + __m128 result128_0 = _mm_setzero_ps(); + __m128 result128_1 = _mm_setzero_ps(); + + __m128i load_idx_lo128 = _mm_set_epi16(0, 3, 0, 2, 0, 1, 0, 0); + __m128i M128_EPI16_4 = _mm_set1_epi16(4); + __m128i load_idx_hi128 = _mm_add_epi16(load_idx_lo128, M128_EPI16_4); + + __m128i xArray128 = _mm512_castsi512_si128(xArray); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m128i matrixArray128_0 = _mm_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)]); // Load 8 rows with n=1 + __m128i matrixArray128_1 = _mm_permutexvar_epi16(load_idx_lo128, matrixArray128_0); // Expand the low 4 elements + __m128i matrixArray128_2 = _mm_permutexvar_epi16(load_idx_hi128, matrixArray128_0); // Expand the high 4 elements + + result128_0 = _mm_dpbf16_ps(result128_0, (__m128bh) matrixArray128_1, (__m128bh) xArray128); + result128_1 = _mm_dpbf16_ps(result128_1, (__m128bh) matrixArray128_2, (__m128bh) xArray128); + + if (tail_num > 4) { + unsigned char store_mask_value = (((unsigned char)0xf) >> (8-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + STORE4_COMPLETE_RESULT(result128_0, y+tag_m_32x) + STORE4_MASK_COMPLETE_RESULT(result128_1, y+tag_m_32x+4, store_mask) + } else { + unsigned char store_mask_value = (((unsigned char)0xf) >> (4-tail_num)); + __mmask8 store_mask = *((__mmask8*) &store_mask_value); + 
STORE4_MASK_COMPLETE_RESULT(result128_0, y+tag_m_32x, store_mask) + } + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=2 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x2_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x2_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x2_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x2(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512i matrixArray_0, matrixArray_1; + __m512i xArray; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char load_mask_value = (((unsigned char)0xff) >> 6); + __mmask8 load_mask = *((__mmask8*) &load_mask_value); + xArray = _mm512_broadcastd_epi32(_mm_maskz_loadu_epi16(load_mask, x)); + + if (tag_m_32x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*2]); // Load 16 rows as n=2 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+16)*2]); // Load 16 rows as n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (m - tag_m_32x >= 16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + 
STORE16_COMPLETE_RESULT(result_0, y+tag_m_32x) + + tag_m_32x += 16; + } + + BLASLONG tail_num = m - tag_m_32x; + if (tail_num > 8) { + result_0 = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(m&15))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 16 rows with n=2 + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_32x, tail_mask) + } else if (tail_num == 8) { + __m256 result256 = _mm256_setzero_ps(); + + __m256i matrixArray256 = _mm256_loadu_si256(&a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_32x) + } else { + __m256 result256 = _mm256_setzero_ps(); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-(m&7))); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + __m256i matrixArray256 = _mm256_maskz_loadu_epi32(tail_mask, &a[(tag_m_32x)*2]); // Load 8 rows with n=2 + __m256i xArray256 = _mm512_castsi512_si256(xArray); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) xArray256); + + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_32x, tail_mask) + } + + return 0; +} + +// 32 rows parallel processing BF16 GEMV kernel for n=3 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_32x3_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_32x3_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_32x3_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_32x3(BLASLONG m, float alpha, 
bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_32x = m & (~31); + + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|0|0|0|0|0| + __m512i xArray_0 = _mm512_broadcastd_epi32(xTmp); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_1 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // x2| 0|x2| 0|...|x2| 0| + + __m512i load_idx_base; + __m512i M512_EPI16_2, M512_EPI16_8, M512_EPI16_16; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_8 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + M512_EPI16_16 = _mm512_add_epi16(M512_EPI16_8, M512_EPI16_8); + load_idx_base = _mm512_set_epi16(46, 45, 43, 42, 40, 39, 37, 36, 34, 33, 31, 30, 28, 27, 25, 24, + 22, 21, 19, 18, 16, 15, 13, 12, 10, 9, 7, 6, 4, 3, 1, 0); + + if (tag_m_32x > 0) { + __m512i load_idx01_1st, load_idx01_2nd, load_idx2_1st, load_idx2_2nd; + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6; + + unsigned int idx_blend_mask_value = ((unsigned int)0x80000000); + __mmask32 idx_blend_mask = *((__mmask32*) &idx_blend_mask_value); + + load_idx01_1st = load_idx_base; + load_idx01_2nd = _mm512_add_epi16(load_idx01_1st, M512_EPI16_16); + load_idx2_1st = _mm512_add_epi16(load_idx01_1st, M512_EPI16_2); + load_idx2_2nd = _mm512_add_epi16(load_idx01_2nd, M512_EPI16_2); + load_idx2_2nd = _mm512_mask_blend_epi16(idx_blend_mask, load_idx2_2nd, _mm512_setzero_si512()); + + for (BLASLONG idx_m = 0; idx_m < tag_m_32x; idx_m+=32) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*3]); // Load 10 
rows with n=3 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+10)*3 + 2)]); // Load 10 rows with n=3 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+21)*3 + 1)]); // Load 10 rows with n=3 plus 2 element + + matrixArray_3 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_1st, matrixArray_1); // Select the first 2 elements for each row + matrixArray_4 = _mm512_permutex2var_epi16(matrixArray_1, load_idx01_2nd, matrixArray_2); // Select the first 2 elements for each row + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_0, load_idx2_1st, matrixArray_1); // Select the third element for each row + matrixArray_6 = _mm512_permutex2var_epi16(matrixArray_1, load_idx2_2nd, matrixArray_2); // Select the third element for each row + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_3, (__m512bh) xArray_0); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_5, (__m512bh) xArray_1); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_4, (__m512bh) xArray_0); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_6, (__m512bh) xArray_1); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_COMPLETE_RESULT(result_1, y+idx_m+16) + } + } + + if (tag_m_32x != m) { + __m256i load256_idx01_1st, load256_idx01_2nd, load256_idx2_1st, load256_idx2_2nd; + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6; + __m256 result256_0, result256_1; + + unsigned short idx256_blend_mask_value = ((unsigned short)0x8000); + __mmask16 idx256_blend_mask = *((__mmask16*) &idx256_blend_mask_value); + + load256_idx01_1st = _mm512_castsi512_si256(load_idx_base); + load256_idx01_2nd = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_8)); + load256_idx2_1st = _mm256_add_epi16(load256_idx01_1st, _mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_add_epi16(load256_idx01_2nd, 
_mm512_castsi512_si256(M512_EPI16_2)); + load256_idx2_2nd = _mm256_mask_blend_epi16(idx256_blend_mask, load256_idx2_2nd, _mm256_setzero_si256()); + + if (m - tag_m_32x > 15) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_loadu_si256(&a[((tag_m_32x+10)*3 + 2)]); // Load 5 rows with n=3 plus 1 element + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_32x) + STORE8_COMPLETE_RESULT(result256_1, y+tag_m_32x+8) + + tag_m_32x += 16; + } + + if (tag_m_32x != m) { + result256_0 = _mm256_setzero_ps(); + result256_1 = _mm256_setzero_ps(); + BLASLONG tail_num = m-tag_m_32x; + + if (tail_num > 10) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> 
(16-((tail_num-10-1)*3+1))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_loadu_si256(&a[((tag_m_32x+5)*3 + 1)]); // Load 5 rows with n=3 plus 1 element + matrixArray256_2 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+10)*3 + 2)]); // Load m-tag_m_32x-10 rows + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else if (tail_num > 5) { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-((tail_num-5-1)*3+2))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_loadu_si256(&a[(tag_m_32x)*3]); // Load 5 rows with n=3 plus 1 element + matrixArray256_1 = _mm256_maskz_loadu_epi16(tail_mask, &a[((tag_m_32x+5)*3+1)]); // Load m-tag_m_32x-5 rows + matrixArray256_2 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, 
matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_4 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx01_2nd, matrixArray256_2); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + matrixArray256_6 = _mm256_permutex2var_epi16(matrixArray256_1, load256_idx2_2nd, matrixArray256_2); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_4, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_1 = _mm256_dpbf16_ps(result256_1, (__m256bh) matrixArray256_6, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } else { + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num*3))); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray256_0 = _mm256_maskz_loadu_epi16(tail_mask, &a[(tag_m_32x)*3]); // Load m-tag_m_32x rows + matrixArray256_1 = _mm256_setzero_si256(); + + matrixArray256_3 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx01_1st, matrixArray256_1); // Select the first 2 elements for each row + matrixArray256_5 = _mm256_permutex2var_epi16(matrixArray256_0, load256_idx2_1st, matrixArray256_1); // Select the third element for each row + + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_3, (__m256bh) _mm512_castsi512_si256(xArray_0)); + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray256_5, (__m256bh) _mm512_castsi512_si256(xArray_1)); + } + + unsigned short store_tail_mask_value = (((unsigned short)0xffff) >> (16-(tail_num))); + __mmask16 store_tail_mask = *((__mmask16*) &store_tail_mask_value); + __m512 
result512 = _mm512_insertf32x8(_mm512_castps256_ps512(result256_0), result256_1, 0x1); + STORE16_MASK_COMPLETE_RESULT(result512, y+tag_m_32x, store_tail_mask) + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=4 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x4_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x4_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x4_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x4(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_01, xArray_23, xArray_remix; + __m512 result; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i idx_base_0 = _mm512_set_epi32(30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_1); + __m512i idx_base_remix = _mm512_inserti32x8(idx_base_0, _mm512_castsi512_si256(idx_base_1), 0x1); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i xTmp = _mm_maskz_loadu_epi32(x_load_mask, x); // |x0|x1|x2|x3|0|0|0|0| + xArray_01 = _mm512_broadcastd_epi32(xTmp); // |x0|x1|x0|x1|...|x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(xTmp, 0x1)); // |x2|x3|x2|x3|...|x2|x3| + unsigned short blend_mask_value = ((unsigned short)0xff00); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + xArray_remix = _mm512_mask_blend_epi32(blend_mask, xArray_01, xArray_23); // 
|x0|x1|x0|x1|x0|x1|x0|x1|...|x2|x3|x2|x3|x2|x3|x2|x3| + + if (tag_m_16x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*4]); // Load 8 rows with n=4 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+8)*4]); // Load 8 rows with n=4 + + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_0, matrixArray_1); // |a0|a1|...|h0|h1|i0|i1|...|p0|p1| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_0, idx_base_1, matrixArray_1); // |a2|a3|...|h2|h3|i2|i3|...|p2|p3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_01); + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_3, (__m512bh) xArray_23); + + STORE16_COMPLETE_RESULT(result, y+idx_m) + } + } + + if (m - tag_m_16x > 7) { + result = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m-tag_m_16x; + if (tail_num != 0) { + result = _mm512_setzero_ps(); + + unsigned short tail_mask_value = (((unsigned short)0xffff) >> (16-tail_num*2)); + __mmask16 tail_mask = *((__mmask16*) &tail_mask_value); + matrixArray_0 = _mm512_maskz_loadu_epi32(tail_mask, &a[(tag_m_16x)*4]); // Load 8 rows with n=4 + matrixArray_2 = _mm512_permutexvar_epi32(idx_base_remix, matrixArray_0); // a0|a1|...|h0|h1|a2|a3|...|h2|h3| + + result = _mm512_dpbf16_ps(result, (__m512bh) matrixArray_2, (__m512bh) xArray_remix); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result), _mm512_extractf32x8_ps(result, 1)); + + unsigned char store_tail_mask_value 
= (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 store_tail_mask = *((__mmask8*) &store_tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, store_tail_mask) + } + + return 0; +} + +// 30 rows parallel processing BF16 GEMV kernel for n=5 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_30x5_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_30x5_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_30x5_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_30x5(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_30x = m - (m%30); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|0|0|0| + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512 result_0, result_1; + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_4 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4| 0|x4| 0|...|x4| 0| + + __m512i M512_EPI16_2 = _mm512_set1_epi16(2); + __m512i load_idx01_stage1_1st = _mm512_set_epi16( 0, 0, 0, 0, 0, 0, 0, 0, 58, 57, 53, 52, 48, 47, 43, 42, + 38, 37, 33, 32, 26, 25, 21, 20, 16, 15, 11, 10, 6, 5, 1, 0); + __m512i load_idx01_stage1_2nd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x39); + __m512i load_idx01_stage1_3rd = _mm512_shuffle_i32x4(load_idx01_stage1_1st, load_idx01_stage1_1st, 0x4f); + + __m512i load_idx23_stage1_1st = 
_mm512_add_epi16(load_idx01_stage1_1st, M512_EPI16_2); + __m512i load_idx23_stage1_2nd = _mm512_add_epi16(load_idx01_stage1_2nd, M512_EPI16_2); + __m512i load_idx23_stage1_3rd = _mm512_add_epi16(load_idx01_stage1_3rd, M512_EPI16_2); + + __m512i load_idx4_stage1_1st = _mm512_add_epi16(load_idx23_stage1_1st, M512_EPI16_2); + __m512i load_idx4_stage1_2nd = _mm512_add_epi16(load_idx23_stage1_2nd, M512_EPI16_2); + __m512i load_idx4_stage1_3rd = _mm512_add_epi16(load_idx23_stage1_3rd, M512_EPI16_2); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage1_0, matrixArray_stage1_1, matrixArray_stage1_2; + __m512i matrixArray_stage2_0, matrixArray_stage2_1; + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + if (tag_m_30x > 0) { + unsigned short blend_mask_value_0 = ((unsigned short)0xf000); + __mmask16 blend_mask_0 = *((__mmask16*) &blend_mask_value_0); + unsigned short blend_mask_value_1 = ((unsigned short)0x3f00); + __mmask16 blend_mask_1 = *((__mmask16*) &blend_mask_value_1); + for (BLASLONG idx_m = 0; idx_m < tag_m_30x; idx_m+=30) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+12)*5)]); // Load 6 rows with n=5 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+18)*5)]); // Load 6 rows with n=5 + matrixArray_4 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+24)*5)]); // Load 6 rows with n=5 + + // Process the 0|1 elements + // Stage 1: Select the 0|1 elements for each row + matrixArray_stage1_0 = 
_mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx01_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx01_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 0|1 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_01); + + // Process the 2|3 elements + // Stage 1: Select the 2|3 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx23_stage1_2nd, matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx23_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 2|3 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 2|3 elements and accumulate the result of 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_23); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_23); + + // Process the for 4 elements + // Stage 1: Select the 4 elements for each row + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_2, load_idx4_stage1_2nd, 
matrixArray_3); + matrixArray_stage1_2 = _mm512_permutexvar_epi16(load_idx4_stage1_3rd, matrixArray_4); + // Stage 2: Reorder and compress all the 4 elements + matrixArray_stage2_0 = _mm512_mask_blend_epi32(blend_mask_0, matrixArray_stage1_0, matrixArray_stage1_1); + matrixArray_stage2_1 = _mm512_mask_blend_epi32(blend_mask_1, matrixArray_stage1_1, matrixArray_stage1_2); + // Calculate the result of the 4 element and accumulate the result of 0|1 and 2|3 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage2_0, (__m512bh) xArray_4); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage2_1, (__m512bh) xArray_4); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + STORE16_MASK_COMPLETE_RESULT(result_1, y+idx_m+16, store_mask) + } + } + + if (m - tag_m_30x > 11) { + BLASLONG tag_m_12x = m - ((m-tag_m_30x)%12); + for (BLASLONG idx_m = tag_m_30x; idx_m < tag_m_12x; idx_m+=12) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[((idx_m+6)*5)]); // Load 6 rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + 
STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_less_mask) + tag_m_30x += 12; + } + } + + BLASLONG tail_num = m - tag_m_30x; + if (tail_num > 6) { + unsigned short store_less_mask_value = (((unsigned short)0xffff) >> (4+(12-tail_num))); + __mmask16 store_less_mask = *((__mmask16*) &store_less_mask_value); + unsigned int load_less_mask_value = (((unsigned int)0xffffffff) >> (2+(12-tail_num)*5)); + __mmask32 load_less_mask = *((__mmask32*) &load_less_mask_value); + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_30x)*5]); // Load 6 rows with n=5 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_less_mask, &a[((tag_m_30x+6)*5)]); // Load x rows with n=5 + + // Interleave the elements + matrixArray_stage1_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx01_stage1_1st, matrixArray_1); + matrixArray_stage1_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx23_stage1_1st, matrixArray_1); + matrixArray_stage1_2 = _mm512_permutex2var_epi16(matrixArray_0, load_idx4_stage1_1st, matrixArray_1); + // Calculate and accumulate the result + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage1_2, (__m512bh) xArray_4); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+tag_m_30x, store_less_mask) + } else { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_30x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*5]); // Load 1 rows with n=5 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef 
ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=6 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x6_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x6_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x6_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x6(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 2); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // x0|x1|x2|x3|x4|x5|0|0| + + if (tag_m_16x > 0) { + __m512 result_0; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_1 = _mm512_set1_epi32(1); + __m512i load_idx01_1st = _mm512_set_epi32( 0, 0, 0, 0, 0, 30, 27, 24, 21, 18, 15, 12, 9, 6, 3, 0); + __m512i load_idx01_2nd = _mm512_set_epi32(13, 10, 7, 4, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0); + + __m512i load_idx23_1st = _mm512_add_epi32(load_idx01_1st, M512_EPI32_1); + __m512i load_idx23_2nd = _mm512_add_epi32(load_idx01_2nd, M512_EPI32_1); + + __m512i load_idx45_1st = _mm512_add_epi32(load_idx23_1st, M512_EPI32_1); + __m512i load_idx45_2nd = _mm512_add_epi32(load_idx23_2nd, M512_EPI32_1); + + unsigned short blend_mask_value = ((unsigned short)0x0400); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + // Set the 11th element to be 0 as invalid 
index for a 512 bit epi32 register + load_idx45_1st = _mm512_mask_blend_epi32(blend_mask, load_idx45_1st, load_idx01_2nd); + // Set the 11th element to be 0 as 0 is the correct index + load_idx45_2nd = _mm512_mask_blend_epi32(blend_mask, load_idx45_2nd, load_idx01_2nd); + + __m512i xArray_01 = _mm512_broadcastd_epi32(x128); // x0|x1|x0|x1|...|x0|x1| + __m512i xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1)); // x2|x3|x2|x3|...|x2|x3| + __m512i xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)); // x4|x5|x4|x5|...|x4|x5| + + unsigned short permute_mask01_uint = (((unsigned short)0xf800)); + __mmask16 permute_mask01 = *((__mmask16*) &permute_mask01_uint); + unsigned short permute_mask45_uint = (((unsigned short)0xfc00)); + __mmask16 permute_mask45 = *((__mmask16*) &permute_mask45_uint); + + __m512i matrixArray_0, matrixArray_1, matrixArray_2; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2; + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*6]); // Load 5 rows with n=6 plus 2 element + matrixArray_1 = _mm512_loadu_si512(&a[((idx_m+5)*6 + 2)]); // Load 5 rows with n=6 plus 2 element + matrixArray_2 = _mm512_loadu_si512(&a[((idx_m+10)*6 + 4)]); // Load 5 rows with n=6 plus 2 element + + // Stage 1: interleave for the a..k elements + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1); + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + + // Stage 2: interleave for the l..p elements and remix together + matrixArray_stage_0 = _mm512_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + matrixArray_stage_1 = _mm512_mask_permutexvar_epi32(matrixArray_stage_1, permute_mask01, load_idx23_2nd, matrixArray_2); 
+ matrixArray_stage_2 = _mm512_mask_permutexvar_epi32(matrixArray_stage_2, permute_mask45, load_idx45_2nd, matrixArray_2); + + // Calculate the result of the 0|1 elements + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_01); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_45); + + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m256i M256_EPI32_1 = _mm512_castsi512_si256(M512_EPI32_1); + __m256i load_idx01_1st = _mm256_set_epi32( 0, 0, 15, 12, 9, 6, 3, 0); + __m256i load_idx01_2nd = _mm256_set_epi32( 5, 2, 0, 0, 0, 0, 0, 0); + + __m256i load_idx23_1st = _mm256_add_epi32(load_idx01_1st, M256_EPI32_1); + __m256i load_idx23_2nd = _mm256_add_epi32(load_idx01_2nd, M256_EPI32_1); + unsigned char blend_mask_value = ((unsigned char)0x20); + __mmask8 blend_mask = *((__mmask8*) &blend_mask_value); + // Set the 6th element to be 0 as invalid index for a 512 bit epi32 register + load_idx23_1st = _mm256_mask_blend_epi32(blend_mask, load_idx23_1st, load_idx01_2nd); + // Set the 6th element to be 0 as 0 is the correct index + load_idx23_2nd = _mm256_mask_blend_epi32(blend_mask, load_idx23_2nd, load_idx01_2nd); + + __m256i load_idx45_1st = _mm256_add_epi32(load_idx23_1st, M256_EPI32_1); + __m256i load_idx45_2nd = _mm256_add_epi32(load_idx23_2nd, M256_EPI32_1); + + unsigned char permute_mask01_uint = (((unsigned char)0xc0)); + __mmask8 permute_mask01 = *((__mmask8*) &permute_mask01_uint); + unsigned char permute_mask45_uint = (((unsigned char)0xe0)); + __mmask8 permute_mask45 = *((__mmask8*) &permute_mask45_uint); + + __m256i matrixArray_0, matrixArray_1, matrixArray_2; + __m256i matrixArray_stage_0; + __m256 result256_0; + + result256_0 = _mm256_setzero_ps(); + + matrixArray_0 = _mm256_loadu_si256(&a[(tag_m_16x)*6]); // Load 2 rows with n=6 plus 4 element + matrixArray_1 = 
_mm256_loadu_si256(&a[((tag_m_16x+2)*6 + 4)]); // Load 2 rows with n=6 plus 4 element + matrixArray_2 = _mm256_loadu_si256(&a[((tag_m_16x+5)*6 + 2)]); // Load 2 rows with n=6 plus 4 element + + // Process the 0|1 elements + // Select the 0|1 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx01_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask01, load_idx01_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_01)); + + // Process the 2|3 elements + // Select the 2|3 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx23_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx23_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_23)); + + // Process the for 4 elements + // Select the 4|5 elements for each row + matrixArray_stage_0 = _mm256_permutex2var_epi32(matrixArray_0, load_idx45_1st, matrixArray_1); + matrixArray_stage_0 = _mm256_mask_permutexvar_epi32(matrixArray_stage_0, permute_mask45, load_idx45_2nd, matrixArray_2); + // Calculate the result of the 0|1 elements + result256_0 = _mm256_dpbf16_ps(result256_0, (__m256bh) matrixArray_stage_0, (__m256bh) _mm512_castsi512_si256(xArray_45)); + + STORE8_COMPLETE_RESULT(result256_0, y+tag_m_16x) + tag_m_16x += 8; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*6]); // Load 1 rows with n=6 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + 
tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=7 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x7_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x7_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x7_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x7(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 1); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128 = _mm_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage1_0 = _mm512_set_epi16(31, 27, 26, 25, 24, 23, 22, 21, 31, 20, 19, 18, 17, 16, 15, 14, + 31, 13, 12, 11, 10, 9, 8, 7, 31, 6, 5, 4, 3, 2, 1, 0); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 
25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = _mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+4)*7]); // Load 4 rows with n=7 + matrixArray_2 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+8)*7]); // Load 4 rows with n=7 + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+12)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + matrixArray_2 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_2); // |i0|i1|i2|i3|...|j6|j7|k0|k1|k2|k3|...|l6|l7| + matrixArray_3 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_3); // |m0|m1|m2|m3|...|n6|n7|o0|o1|o2|o3|...|p6|p7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, 
load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 3: interleave per 256 bits + result_2 = _mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = 
_mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(tag_m_16x)*7]); // Load 4 rows with n=7 + unsigned int tail_load_mask_value = (((unsigned int)0xffffffff) >> (4+(8-tail_num)*7)); + __mmask32 tail_load_mask = *((__mmask32*) &tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi16(tail_load_mask, &a[(tag_m_16x+4)*7]); // Load 4 rows with n=7 + + // Stage 1: padding + matrixArray_0 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_0); // |a0|a1|a2|a3|...|b6|b7|c0|c1|c2|c3|...|d6|d7| + matrixArray_1 = _mm512_permutexvar_epi16(load_idx_stage1_0, matrixArray_1); // |e0|e1|e2|e3|...|f6|f7|g0|g1|g2|g3|...|h6|h7| + + // Stage 2: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + 
result128 = _mm_setzero_ps(); + matrixArray128 = _mm_maskz_loadu_epi16(x_load_mask, &a[(i)*7]); // Load 1 rows with n=7 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=8 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x8_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x8_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x8_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x8(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m128i x128 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_0123, xArray_4567; + __m512 result_0, result_1, result_2, result_3; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_2 = _mm512_set1_epi32(2); + __m512i load_idx_stage2_0 = _mm512_set_epi32(29, 25, 21, 17, 13, 9, 5, 1, 28, 24, 20, 16, 12, 8, 4, 0); + __m512i load_idx_stage2_1 = 
_mm512_add_epi32(load_idx_stage2_0, M512_EPI32_2); + + unsigned short x_blend_mask_value = ((unsigned short)0xff00); + __mmask16 x_blend_mask = *((__mmask16*) &x_blend_mask_value); + xArray_0123 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(x128), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x1))); + xArray_4567 = _mm512_mask_blend_epi32(x_blend_mask, _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x2)), \ + _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128, 0x3))); + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(idx_m+4)*8]); // Load 4 rows with n=8 + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+8)*8]); // Load 4 rows with n=8 + matrixArray_3 = _mm512_loadu_si512(&a[(idx_m+12)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|...|h0|h1|a2|a3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|...|h4|h5|a6|a7|...|h6|h7| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_0, matrixArray_3); // |i0|i1|...|p0|p1|i2|i3|...|p2|p3| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage2_1, matrixArray_3); // |i4|i5|...|p4|p5|i6|i7|...|p6|p7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_2, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_stage_3, (__m512bh) xArray_4567); + + // Stage 2: interleave per 256 bits + result_2 = 
_mm512_shuffle_f32x4(result_0, result_1, 0x44); + result_3 = _mm512_shuffle_f32x4(result_0, result_1, 0xee); + + result_2 = _mm512_add_ps(result_2, result_3); + + STORE16_COMPLETE_RESULT(result_2, y+idx_m) + } + + if (m - tag_m_16x > 7) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + BLASLONG tail_num = m - tag_m_16x; + if (tail_num > 3) { + result_0 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x)*8]); // Load 4 rows with n=8 + unsigned short tail_load_mask_value = (((unsigned int)0xffff) >> ((8-tail_num)*4)); + __mmask16 tail_load_mask = *((__mmask16*) &tail_load_mask_value); + matrixArray_1 = _mm512_maskz_loadu_epi32(tail_load_mask, &a[(tag_m_16x+4)*8]); // Load 4 rows with n=8 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_0, matrixArray_1); // |a0|a1|b0|b1|...|h0|h1|a2|a3|b2|b3|...|h2|h3| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage2_1, matrixArray_1); // |a4|a5|b4|b5|...|h4|h5|a6|a7|b6|b7|...|h6|h7| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) 
matrixArray_stage_0, (__m512bh) xArray_0123); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_stage_1, (__m512bh) xArray_4567); + + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(result_0), _mm512_extractf32x8_ps(result_0, 0x1)); + + unsigned char tail_mask_value = (((unsigned char)0xff) >> (8-tail_num)); + __mmask8 tail_mask = *((__mmask8*) &tail_mask_value); + STORE8_MASK_COMPLETE_RESULT(result256, y+tag_m_16x, tail_mask) + tag_m_16x = m; + } + } + + if (tag_m_16x != m) { + __m128i matrixArray128; + __m128 result128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + result128 = _mm_setzero_ps(); + matrixArray128 = _mm_loadu_si128(&a[(i)*8]); // Load 1 rows with n=8 + result128 = _mm_dpbf16_ps(result128, (__m128bh) matrixArray128, (__m128bh) x128); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 14 rows parallel processing BF16 GEMV kernel for n=9 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_14x9_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_14x9_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_14x9_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_14x9(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_14x = m - (m%14); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 7); + __mmask8 x_load_mask = *((__mmask8*) 
&x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|0 |0 | 0| 0| 0| 0| 0| + + if (tag_m_14x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI16_2 = _mm256_set1_epi16(2); + __m256i idx_base_0 = _mm256_set_epi16( 0, 0, 55, 54, 46, 45, 37, 36, 28, 27, 19, 18, 10, 9, 1, 0); + __m256i idx_base_1 = _mm256_add_epi16(idx_base_0, M256_EPI16_2); + __m256i idx_base_2 = _mm256_add_epi16(idx_base_1, M256_EPI16_2); + __m256i idx_base_3 = _mm256_add_epi16(idx_base_2, M256_EPI16_2); + __m256i idx_base_4 = _mm256_add_epi16(idx_base_3, M256_EPI16_2); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 6, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 22, 21, 20, 19, 18, 17, 16, 13, 12, 11, 10, 9, 8, 7); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); 
// |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|0 |x8| 0| ... |x8| 0| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 1); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + unsigned short blend_mask_value = ((unsigned short)0x3f80); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_14x; idx_m+=14) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(idx_m)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+3)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+7)*9]); // Load 3 rows with n=9 plus 5 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*9 + 5]); // Load 3 rows with n=9 plus 4 elements + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|g0|g1|a2|a3|...|g2|g3|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|g4|g5|a6|a7|...|g6|g7|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |h2|h3|...|n2|n3|h0|h1|...|n0|n1|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |h6|h7|...|n6|n7|h4|h5|...|n4|n5|x|x|x|x| + matrixArray_4 = 
_mm512_permutex2var_epi16(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8| x|...|g8| x| x| x|...| x| x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi16(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|h8| x|...|n8| x|x|x|x|x| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|b0|b1|...|h0|h1|i0|i1|j0|j1|...|n0|n1|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|b2|b3|...|h2|h3|i2|i3|j2|j3|...|n2|n3|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|b4|b5|...|h4|h5|i4|i5|j4|j5|...|n4|n5|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|b6|b7|...|h6|h7|i6|i7|j6|j7|...|n6|n7|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_4, matrixArray_5); // |a8| x|b8| x|...|h8| x|i8| x|j8| x|...|n8| x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_14x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 7); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_14x; i < 
m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*9]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 12 rows parallel processing BF16 GEMV kernel for n=10 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_12x10_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_12x10_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_12x10_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_12x10(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_12x = m - (m%12); + + unsigned char x_load_mask_value = (((unsigned char)0xf) >> 3); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1|x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi32(x_load_mask, (x+8)); // |x8|x9|0 | 0| 0| 0| 0| 0| + + if (tag_m_12x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, 
xArray_89; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m256i M256_EPI32_1 = _mm256_set1_epi32(1); + __m256i idx_base_0 = _mm256_set_epi32( 0, 0, 26, 21, 16, 10, 5, 0); + __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_1); + __m256i idx_base_2 = _mm256_add_epi32(idx_base_1, M256_EPI32_1); + __m256i idx_base_3 = _mm256_add_epi32(idx_base_2, M256_EPI32_1); + __m256i idx_base_4 = _mm256_add_epi32(idx_base_3, M256_EPI32_1); + __m512i idx_idx = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 5, 4, 3, 2, 1, 0); + + __m512i load_idx_stage1_0 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_0), idx_idx, _mm512_castsi256_si512(idx_base_1)); + __m512i load_idx_stage1_1 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_2), idx_idx, _mm512_castsi256_si512(idx_base_3)); + __m512i load_idx_stage1_2 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_1), idx_idx, _mm512_castsi256_si512(idx_base_0)); + __m512i load_idx_stage1_3 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_3), idx_idx, _mm512_castsi256_si512(idx_base_2)); + __m512i load_idx_stage1_4 = _mm512_permutex2var_epi32(_mm512_castsi256_si512(idx_base_4), idx_idx, _mm512_castsi256_si512(idx_base_4)); + __m512i load_idx_stage2_0 = _mm512_set_epi32( 0, 0, 0, 0, 21, 20, 19, 18, 17, 16, 11, 10, 9, 8, 7, 6); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0|x1|x0|x1| ... |x0|x1| + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2|x3|x2|x3| ... |x2|x3| + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4|x5|x4|x5| ... |x4|x5| + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6|x7|x6|x7| ... |x6|x7| + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8|x9|x8|x9| ... 
|x8|x9| + + unsigned short blend_mask_value = ((unsigned short)0x0fc0); + __mmask16 blend_mask = *((__mmask16*) &blend_mask_value); + unsigned short load_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 load_mask = *((__mmask16*) &load_mask_value); + unsigned short store_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + for (BLASLONG idx_m = 0; idx_m < tag_m_12x; idx_m+=12) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m)*10]); // Load 3 rows with n=10 + matrixArray_1 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+3)*10]); // Load 3 rows with n=10 + matrixArray_2 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+6)*10]); // Load 3 rows with n=10 + matrixArray_3 = _mm512_maskz_loadu_epi32(load_mask, &a[(idx_m+9)*10]); // Load 3 rows with n=10 + + // Stage 1: interleave per 32 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_0, matrixArray_1); // |a0|a1|...|f0|f1|a2|a3|...|f2|f3|x|x|x|x|x|x|x|x| + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_1, matrixArray_1); // |a4|a5|...|f4|f5|a6|a7|...|f6|f7|x|x|x|x|x|x|x|x| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_2, matrixArray_3); // |g2|g3|...|l2|l3|g0|g1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_3, matrixArray_3); // |g6|g7|...|l6|l7|g4|g5|...|l4|l5|x|x|x|x|x|x|x|x| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_0, load_idx_stage1_4, matrixArray_1); // |a8|a9|...|f8|f9| x| x|...| x| x|x|x|x|x|x|x|x|x| + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_2, load_idx_stage1_4, matrixArray_3); // | x| x|...| x| x|g8|g9|...|l8|l9|x|x|x|x|x|x|x|x| + + // Stage 3: interleave per 256 bits + matrixArray_0 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_0, matrixArray_stage_2); 
// |a0|a1|...|l0|l1|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, load_idx_stage2_0, matrixArray_stage_2); // |a2|a3|...|l2|l3|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_1, matrixArray_stage_3); // |a4|a5|...|l4|l5|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_permutex2var_epi32(matrixArray_stage_1, load_idx_stage2_0, matrixArray_stage_3); // |a6|a7|...|l6|l7|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_mask_blend_epi32(blend_mask, matrixArray_stage_4, matrixArray_stage_5); // |a8|a9|...|l8|l9|x|x|x|x|x|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_12x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned char load256_mask_value = (((unsigned char)0xff) >> 3); + __mmask8 load256_mask = *((__mmask8*) &load256_mask_value); + for (BLASLONG i = tag_m_12x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi32(load256_mask, &a[(i)*10]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, 
tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=11 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x11_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x11_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x11_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x11(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 5); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2|x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10| 0| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI16_2, M512_EPI16_4, M512_EPI16_6, 
M512_EPI32_5; + M512_EPI16_2 = _mm512_set1_epi16(2); + M512_EPI16_4 = _mm512_add_epi16(M512_EPI16_2, M512_EPI16_2); + M512_EPI16_6 = _mm512_add_epi16(M512_EPI16_4, M512_EPI16_2); + M512_EPI32_5 = _mm512_set1_epi32(5); + + unsigned int BASE_MASK_10_value = ((unsigned int)0x000003ff); + __mmask32 BASE_MASK_10 = *((__mmask32*) &BASE_MASK_10_value); + unsigned int BASE_MASK_20_value = ((unsigned int)0x000ffc00); + __mmask32 BASE_MASK_20 = *((__mmask32*) &BASE_MASK_20_value); + unsigned int BASE_MASK_30_value = ((unsigned int)0x3ff00000); + __mmask32 BASE_MASK_30 = *((__mmask32*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi16( 0, 0, 49, 48, 38, 37, 27, 26, 16, 15, 5, 4, 47, 46, 36, 35, + 25, 24, 14, 13, 3, 2, 45, 44, 34, 33, 23, 22, 12, 11, 1, 0); + idx_stage1_base_1 = _mm512_add_epi16(idx_stage1_base_0, M512_EPI16_6); + + idx_stage1_base_2 = _mm512_mask_add_epi16(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_2 = _mm512_mask_sub_epi16(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI16_2); + idx_stage1_base_3 = _mm512_add_epi16(idx_stage1_base_2, M512_EPI16_6); + + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_add_epi16(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI16_2); + idx_stage1_base_4 = _mm512_mask_sub_epi16(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI16_4); + idx_stage1_base_5 = _mm512_add_epi16(idx_stage1_base_4, M512_EPI16_6); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 
7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|0 |x10|0 | ... |x10|0 | + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 9); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*11 + 32]); // Load 2 rows with n=11 plus 1 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*11]); // Load 2 rows with n=11 plus 10 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*11 + 32]); // Load 2 rows with n=11 plus 1 element + + // Stage 1: interleave per 16 bits + 
matrixArray_stage_0 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0|a1|...|e0|e1|a2|a3|...|e2|e3|a4 |a5|...|e4 |e5| + matrixArray_stage_1 = _mm512_permutex2var_epi16(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6|a7|...|e6|e7|a8|a9|...|e8|e9|a10|x |...|e10|x | + matrixArray_stage_2 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2|f3|...|j2|j3|f0|f1|...|j0|j1|f4 |f5|...|j4 |j5| + matrixArray_stage_3 = _mm512_permutex2var_epi16(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8|f9|...|j8|j9|f6|f7|...|j6|j7|f10|x |...|j10|x | + matrixArray_stage_4 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4|k5|...|o4|o5|k2|k3|...|o2|o3|k0 |k1|...|o0 |o1| + matrixArray_stage_5 = _mm512_permutex2var_epi16(matrixArray_4, idx_stage1_base_5, matrixArray_5); // |k10|x|...|o10|x|k8|k9|...|o8|o9|k6 |k7|...|o6 |o7| + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0|a1|...|j0|j1|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6|a7|...|j6|j7|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2|a3|...|j2|j3|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4|a5|...|j4|j5|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8|a9|...|j8|j9|x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|x|...|j10|x|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // 
|a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 5); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*11]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = 
_mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + +// 15 rows parallel processing BF16 GEMV kernel for n=12 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_15x12_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_15x12_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_15x12_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_15x12(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_15x = m - (m%15); + + unsigned char x_load_mask_value = (((unsigned char)0xff) >> 4); + __mmask8 x_load_mask = *((__mmask8*) &x_load_mask_value); + __m128i x128_0 = _mm_loadu_si128(x); // |x0|x1| x2| x3|x4|x5|x6|x7| + __m128i x128_1 = _mm_maskz_loadu_epi16(x_load_mask, (x+8)); // |x8|x9|x10|x11| 0| 0| 0| 0| + + if (tag_m_15x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5; + __m512i matrixArray_stage_0, matrixArray_stage_1, matrixArray_stage_2, matrixArray_stage_3, matrixArray_stage_4, matrixArray_stage_5; + __m512i xArray_01, xArray_23, xArray_45, xArray_67, xArray_89, xArray_10; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + 
+ __m512i idx_stage1_base_0, idx_stage1_base_1, idx_stage1_base_2, idx_stage1_base_3, idx_stage1_base_4, idx_stage1_base_5; + __m512i idx_stage2_base_0, idx_stage2_base_1, idx_stage2_base_2, idx_stage2_base_3; + + __m512i M512_EPI32_1, M512_EPI32_2, M512_EPI32_3, M512_EPI32_5; + M512_EPI32_1 = _mm512_set1_epi32(1); + M512_EPI32_2 = _mm512_add_epi32(M512_EPI32_1, M512_EPI32_1); + M512_EPI32_3 = _mm512_add_epi32(M512_EPI32_2, M512_EPI32_1); + M512_EPI32_5 = _mm512_add_epi32(M512_EPI32_3, M512_EPI32_2); + + unsigned short BASE_MASK_10_value = ((unsigned short)0x001f); + __mmask16 BASE_MASK_10 = *((__mmask16*) &BASE_MASK_10_value); + unsigned short BASE_MASK_20_value = ((unsigned short)0x03e0); + __mmask16 BASE_MASK_20 = *((__mmask16*) &BASE_MASK_20_value); + unsigned short BASE_MASK_30_value = ((unsigned short)0xfc00); + __mmask16 BASE_MASK_30 = *((__mmask16*) &BASE_MASK_30_value); + + idx_stage1_base_0 = _mm512_set_epi32( 0, 26, 20, 14, 8, 2, 25, 19, 13, 7, 1, 24, 18, 12, 6, 0); + idx_stage1_base_1 = _mm512_add_epi32(idx_stage1_base_0, M512_EPI32_3); + + idx_stage1_base_2 = _mm512_mask_add_epi32(idx_stage1_base_0, BASE_MASK_10, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_2 = _mm512_mask_sub_epi32(idx_stage1_base_2, BASE_MASK_20, idx_stage1_base_0, M512_EPI32_1); + idx_stage1_base_3 = _mm512_add_epi32(idx_stage1_base_2, M512_EPI32_3); + + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_2, BASE_MASK_10, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_add_epi32(idx_stage1_base_4, BASE_MASK_20, idx_stage1_base_2, M512_EPI32_1); + idx_stage1_base_4 = _mm512_mask_sub_epi32(idx_stage1_base_4, BASE_MASK_30, idx_stage1_base_2, M512_EPI32_2); + idx_stage1_base_5 = _mm512_add_epi32(idx_stage1_base_4, M512_EPI32_3); + + unsigned short idx_stage2_mask_1_value = ((unsigned short)0x03e0); + __mmask16 idx_stage2_mask_1 = *((__mmask16*) &idx_stage2_mask_1_value); + unsigned short idx_stage2_mask_2_value = ((unsigned short)0x7c00); + __mmask16 
idx_stage2_mask_2 = *((__mmask16*) &idx_stage2_mask_2_value); + idx_stage2_base_0 = _mm512_set_epi32( 0, 0, 0, 0, 0, 0, 20, 19, 18, 17, 16, 9, 8, 7, 6, 5); + idx_stage2_base_1 = _mm512_set_epi32( 0, 25, 24, 23, 22, 21, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + idx_stage2_base_2 = _mm512_add_epi32(idx_stage2_base_0, M512_EPI32_5); + idx_stage2_base_2 = _mm512_mask_add_epi32(idx_stage2_base_2, idx_stage2_mask_1, idx_stage2_base_2, M512_EPI32_5); + idx_stage2_base_3 = _mm512_mask_sub_epi32(idx_stage2_base_1, idx_stage2_mask_2, idx_stage2_base_1, M512_EPI32_5); + + xArray_01 = _mm512_broadcastd_epi32(x128_0); // |x0 |x1 |x0 |x1 | ... |x0 |x1 | + xArray_23 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x1)); // |x2 |x3 |x2 |x3 | ... |x2 |x3 | + xArray_45 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x2)); // |x4 |x5 |x4 |x5 | ... |x4 |x5 | + xArray_67 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_0, 0x3)); // |x6 |x7 |x6 |x7 | ... |x6 |x7 | + xArray_89 = _mm512_broadcastd_epi32(x128_1); // |x8 |x9 |x8 |x9 | ... |x8 |x9 | + xArray_10 = _mm512_broadcastd_epi32(_mm_shuffle_epi32(x128_1, 0x1)); // |x10|x11|x10|x11| ... 
|x10|x11| + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + unsigned short store_mask_value = (((unsigned short)0xffff) >> 1); + __mmask16 store_mask = *((__mmask16*) &store_mask_value); + + for (BLASLONG idx_m = 0; idx_m < tag_m_15x; idx_m+=15) { + result_0 = _mm512_setzero_ps(); + result_1 = _mm512_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[idx_m*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_1 = _mm512_maskz_loadu_epi16(load_mask, &a[idx_m*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_2 = _mm512_loadu_si512(&a[(idx_m+5)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_3 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+5)*12 + 32]); // Load 2 rows with n=12 plus 4 element + matrixArray_4 = _mm512_loadu_si512(&a[(idx_m+10)*12]); // Load 2 rows with n=12 plus 8 elements + matrixArray_5 = _mm512_maskz_loadu_epi16(load_mask, &a[(idx_m+10)*12 + 32]); // Load 2 rows with n=12 plus 4 element + + // Stage 1: interleave per 16 bits + matrixArray_stage_0 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_0, matrixArray_1); // |a0 |a1 |...|e0 |e1 |a2|a3|...|e2|e3|a4 |a5 |...|e4 |e5 | + matrixArray_stage_1 = _mm512_permutex2var_epi32(matrixArray_0, idx_stage1_base_1, matrixArray_1); // |a6 |a7 |...|e6 |e7 |a8|a9|...|e8|e9|a10|a11|...|e10|e11| + matrixArray_stage_2 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_2, matrixArray_3); // |f2 |f3 |...|j2 |j3 |f0|f1|...|j0|j1|f4 |f5 |...|j4 |j5 | + matrixArray_stage_3 = _mm512_permutex2var_epi32(matrixArray_2, idx_stage1_base_3, matrixArray_3); // |f8 |f9 |...|j8 |j9 |f6|f7|...|j6|j7|f10|f11|...|j10|j11| + matrixArray_stage_4 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_4, matrixArray_5); // |k4 |k5 |...|o4 |o5 |k2|k3|...|o2|o3|k0 |k1 |...|o0 |o1 | + matrixArray_stage_5 = _mm512_permutex2var_epi32(matrixArray_4, idx_stage1_base_5, matrixArray_5); // 
|k10|k11|...|o10|o11|k8|k9|...|o8|o9|k6 |k7 |...|o6 |o7 | + + // Stage 2: interleave per 32 bits + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_0, matrixArray_stage_2); // |a0 |a1 |...|j0 |j1 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_1, matrixArray_stage_1, matrixArray_stage_3); // |a6 |a7 |...|j6 |j7 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_0, matrixArray_stage_2); // |a2 |a3 |...|j2 |j3 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_stage_0, idx_stage2_base_2, matrixArray_stage_2); // |a4 |a5 |...|j4 |j5 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_0, matrixArray_stage_3); // |a8 |a9 |...|j8 |j9 |x|x|x|x|x|x|x|x|x|x|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_stage_1, idx_stage2_base_2, matrixArray_stage_3); // |a10|a11|...|j10|j11|x|x|x|x|x|x|x|x|x|x|x|x| + + matrixArray_0 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_0, matrixArray_stage_4); // |a0|a1|.......................|o0|o1|x|x| + matrixArray_3 = _mm512_mask_blend_epi32(idx_stage2_mask_2, matrixArray_3, matrixArray_stage_5); // |a6|a7|.......................|o6|o7|x|x| + matrixArray_1 = _mm512_permutex2var_epi32(matrixArray_1 , idx_stage2_base_1, matrixArray_stage_4); // |a2|a3|.......................|o2|o3|x|x| + matrixArray_2 = _mm512_permutex2var_epi32(matrixArray_2 , idx_stage2_base_3, matrixArray_stage_4); // |a4|a5|.......................|o4|o5|x|x| + matrixArray_4 = _mm512_permutex2var_epi32(matrixArray_4 , idx_stage2_base_1, matrixArray_stage_5); // |a8|a9|.......................|o8|o9|x|x| + matrixArray_5 = _mm512_permutex2var_epi32(matrixArray_5 , idx_stage2_base_3, matrixArray_stage_5); // |a10|x|.......................|o10|x|x|x| + + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_0, (__m512bh) xArray_01); + result_1 = 
_mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_1, (__m512bh) xArray_23); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_2, (__m512bh) xArray_45); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_3, (__m512bh) xArray_67); + result_0 = _mm512_dpbf16_ps(result_0, (__m512bh) matrixArray_4, (__m512bh) xArray_89); + result_1 = _mm512_dpbf16_ps(result_1, (__m512bh) matrixArray_5, (__m512bh) xArray_10); + result_0 = _mm512_add_ps(result_0, result_1); + + STORE16_MASK_COMPLETE_RESULT(result_0, y+idx_m, store_mask) + } + } + + if (tag_m_15x != m) { + __m256i matrixArray256; + __m256i x256 = _mm256_insertf128_si256(_mm256_castsi128_si256(x128_0), x128_1, 0x1); + __m256 result256; + __m128 result128, tmp128; + unsigned short load256_mask_value = (((unsigned short)0xffff) >> 4); + __mmask16 load256_mask = *((__mmask16*) &load256_mask_value); + for (BLASLONG i = tag_m_15x; i < m; i++) { + result256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(load256_mask, &a[(i)*12]); + result256 = _mm256_dpbf16_ps(result256, (__m256bh) matrixArray256, (__m256bh) x256); + result128 = _mm_add_ps(_mm256_castps256_ps128(result256), _mm256_extractf128_ps(result256, 0x1)); + tmp128 = _mm_shuffle_ps(result128, result128, 14); + result128 = _mm_add_ps(result128, tmp128); + tmp128 = _mm_shuffle_ps(result128, result128, 1); + result128 = _mm_add_ps(result128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * result128[0] + beta * y[i]; +#else + y[i] = alpha * result128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = result128[0] * alpha; +#else + y[i] = result128[0]; +#endif +#endif + } + } + + return 0; +} + + +// 16 rows parallel processing BF16 GEMV kernel for n=13 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x13_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x13_alpha_one(BLASLONG m, float alpha, bfloat16 *a, 
bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x13_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x13(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 3); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|0|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 6); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m, 0, x_load_mask) + + 
matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, idx_m+8, 0, x_load_mask) + + matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1); + matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1); + matrixArray_10 = 
_mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1); + matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1); + + // interleave per 256 bits + matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44); + matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee); + matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44); + matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 13, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 
accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*13]); // Load 1 rows with n=13 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 16 rows parallel processing BF16 GEMV kernel for n=14 && lda ineffective scenario +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_16x14_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x14_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x14_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x14(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 2); + __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value); + __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|0|0| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, 
matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + __m512i shift_idx = _mm512_set_epi32(0, 13, 12, 11, 10, 9, 8, 7, 0, 6, 5, 4, 3, 2, 1, 0); + + unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 4); + __mmask32 load_mask = *((__mmask32*) &load_mask_value); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + BF16_MATRIX_MASKZ_LOAD_8x32_2(matrixArray, a, 14, idx_m, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_8x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + // Load matrix + 
BF16_MATRIX_MASKZ_LOAD_4x32_2(matrixArray, a, 14, tag_m_16x, 0, load_mask) + + // Pre-stage: shift the 2nd vector 1 position right for each register + BF16_PERMUTE_4x32_2(shift_idx, matrixArray) + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if (m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 14, tag_m_16x, 0, x_load_mask) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); 
+ matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*14]); // Load 1 rows with n=14 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +}
+
+/* BF16 GEMV kernel specialised for n = 15 with rows packed back to back
+ * (row i starts at a[i*15]; there is no lda argument).
+ *
+ * One of four function names is selected by the ZERO_BETA / ONE_BETA /
+ * ONE_ALPHA build macros; the ZERO_BETA variants take no beta parameter.
+ *
+ * Result per row i, as spelled out by the one-row tail loop at the bottom:
+ *   default     y[i] = alpha * dot(a[i*15 .. i*15+14], x[0..14]) + beta * y[i]
+ *   ONE_BETA    y[i] = alpha * dot + y[i]
+ *   ZERO_BETA   y[i] = dot * alpha   (dot alone when ONE_ALPHA is also defined)
+ *
+ * Rows are consumed 16 at a time, then at most one group of 8 and one group
+ * of 4 (both nested under tag_m_16x > 0, so they only run when m >= 16), and
+ * whatever is left one row at a time.
+ *
+ * NOTE(review): the BF16_... and STORE..._COMPLETE_RESULT macros are defined
+ * outside this chunk. ALPHAVECTOR and BETAVECTOR are never named in this
+ * body, so the STORE macros presumably pick them up by name -- confirm
+ * before renaming or removing any local here.
+ *
+ * Always returns 0.
+ */
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_16x15_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_16x15_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_16x15_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_16x15(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_16x = m & (~15);     // m rounded down to a multiple of 16; advanced below as the row cursor
+
+    unsigned short x_load_mask_value = (((unsigned short)0xffff) >> 1);     // 0x7fff: lanes 0..14 enabled, lane 15 zeroed
+    __mmask16 x_load_mask = *((__mmask16*) &x_load_mask_value);
+    __m256i x256 = _mm256_maskz_loadu_epi16(x_load_mask, x);                   // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|0|
+
+    if (tag_m_16x > 0) {
+        __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+                matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+        __m512 accum512_0, accum512_1;
+        __m512 result_0, result_1;
+
+        __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7;
+
+#ifndef ONE_ALPHA
+        __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+        __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);   // fp32 lanes 0-3 and 8-11 of each permutex2var source
+        __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);                                   // same pattern shifted by 4: lanes 4-7 and 12-15
+
+        unsigned int load_mask_value = (((unsigned int)0xffffffff) >> 2);   // NOTE(review): load_mask is not referenced again in this body; every load below passes x_load_mask
+        __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+
+        // Prepare X with 2-step interleave way (x256 is first duplicated into both 256-bit halves)
+        xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1);
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix: two masked 8x16 loads (rows idx_m.. and idx_m+8..); each pair of 256-bit results is packed into one 512-bit register
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m, 0, x_load_mask)
+
+            matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, idx_m+8, 0, x_load_mask)
+
+            matrixArray_12 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_13 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_14 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_15 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits
+            BF16_INTERLEAVE256_8x32(matrixArray)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..p[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+            result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+            result_0 = _mm512_add_ps(result_0, result_1);
+            STORE16_COMPLETE_RESULT(result_0, y+idx_m)
+        }
+
+        if (m - tag_m_16x > 7) {
+            __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0);   // swaps the two middle groups of four fp32 lanes
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load matrix
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+            matrixArray_8 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_0), matrixArray256_1, 0x1);
+            matrixArray_9 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_2), matrixArray256_3, 0x1);
+            matrixArray_10 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_4), matrixArray256_5, 0x1);
+            matrixArray_11 = _mm512_inserti32x8(_mm512_castsi256_si512(matrixArray256_6), matrixArray256_7, 0x1);
+
+            // interleave per 256 bits (0x44 keeps the low 256-bit half of each source, 0xee the high half)
+            matrixArray_0 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0x44);
+            matrixArray_1 = _mm512_shuffle_i32x4(matrixArray_8, matrixArray_10, 0xee);
+            matrixArray_2 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0x44);
+            matrixArray_3 = _mm512_shuffle_i32x4(matrixArray_9, matrixArray_11, 0xee);
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0);
+            __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));   // low half + high half
+            STORE8_COMPLETE_RESULT(result256, y+tag_m_16x)
+            tag_m_16x += 8;     // these 8 rows are done
+        }
+
+        if (m - tag_m_16x > 3) {
+            __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3;
+            __m256 accum256_0, accum256_1;
+
+            xArray256_0 = _mm512_castsi512_si256(xArray_0);     // low 256 bits of each prepared x register
+            xArray256_1 = _mm512_castsi512_si256(xArray_1);
+            xArray256_2 = _mm512_castsi512_si256(xArray_2);
+            xArray256_3 = _mm512_castsi512_si256(xArray_3);
+
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_4x16(matrixArray256, a, 15, tag_m_16x, 0, x_load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x16(matrixArray256)
+
+            // Calculate the temp result for a..d[0:15]
+            BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256)
+
+            accum256_0 = _mm256_add_ps(accum256_0, accum256_1);
+            __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            STORE4_COMPLETE_RESULT(result128, y+tag_m_16x)
+            tag_m_16x += 4;     // these 4 rows are done
+        }
+    }
+
+    if (tag_m_16x != m) {
+        __m256i matrixArray256;
+        __m256 accum256;
+        __m128 accum128, tmp128;
+        for (BLASLONG i = tag_m_16x; i < m; i++) {
+            accum256 = _mm256_setzero_ps();
+            matrixArray256 = _mm256_maskz_loadu_epi16(x_load_mask, &a[(i)*15]);                 // Load 1 rows with n=15
+            accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256);
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));   // 8 partial sums -> 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);                                   // lanes 2,3 moved down to 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);                                             // 4 -> 2
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);                                   // lane 1 moved down to 0
+            accum128 = _mm_add_ps(accum128, tmp128);                                             // lane 0 now holds the full dot product
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 16 rows parallel processing BF16 GEMV kernel for n=16 && lda ineffective scenario +#ifndef ZERO_BETA
+#ifndef ONE_BETA +static int sbgemv_kernel_16x16_alpha_beta(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_16x16_alpha_one(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_16x16_alpha(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_16x16(BLASLONG m, float alpha, bfloat16 *a, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_16x = m & (~15); + + __m256i x256 = _mm256_loadu_si256(x); // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15| + + if (tag_m_16x > 0) { + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \ + matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + __m512 accum512_0, accum512_1; + __m512 result_0, result_1; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + // Prepare X with 2-step interleave way + xArray_0 = _mm512_inserti32x8(_mm512_castsi256_si512(x256), x256, 0x1); + BF16_INTERLEAVE_1x32(xArray) + + for (BLASLONG idx_m = 0; idx_m < tag_m_16x; idx_m+=16) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_8 = _mm512_loadu_si512(&a[(idx_m )*16]); // Load 2 rows with n=16 + matrixArray_9 = _mm512_loadu_si512(&a[(idx_m+2 )*16]); // Load 2 rows with n=16 + matrixArray_10 = _mm512_loadu_si512(&a[(idx_m+4 )*16]); // Load 2 rows with n=16 + matrixArray_11 = _mm512_loadu_si512(&a[(idx_m+6 )*16]); // Load 2 
rows with n=16 + matrixArray_12 = _mm512_loadu_si512(&a[(idx_m+8 )*16]); // Load 2 rows with n=16 + matrixArray_13 = _mm512_loadu_si512(&a[(idx_m+10)*16]); // Load 2 rows with n=16 + matrixArray_14 = _mm512_loadu_si512(&a[(idx_m+12)*16]); // Load 2 rows with n=16 + matrixArray_15 = _mm512_loadu_si512(&a[(idx_m+14)*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_8x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_8x32(matrixArray) + + // Calculate the temp result for a..p[0:15] + BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray) + + // Reorder and add up the final result + result_0 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1); + result_1 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1); + result_0 = _mm512_add_ps(result_0, result_1); + STORE16_COMPLETE_RESULT(result_0, y+idx_m) + } + + if (m - tag_m_16x > 7) { + __m512i permutevar_idx = _mm512_set_epi32(15, 14, 13, 12, 7, 6, 5, 4, 11, 10, 9, 8, 3, 2, 1, 0); + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + + matrixArray_4 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_5 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + matrixArray_6 = _mm512_loadu_si512(&a[(tag_m_16x+4 )*16]); // Load 2 rows with n=16 + matrixArray_7 = _mm512_loadu_si512(&a[(tag_m_16x+6 )*16]); // Load 2 rows with n=16 + + // interleave per 256 bits + BF16_INTERLEAVE256_4x32(matrixArray) + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x32(matrixArray) + + // Calculate the temp result for a..h[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray) + + accum512_0 = _mm512_add_ps(accum512_0, accum512_1); + accum512_0 = _mm512_permutexvar_ps(permutevar_idx, accum512_0); + __m256 result256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1)); + STORE8_COMPLETE_RESULT(result256, y+tag_m_16x) + tag_m_16x += 8; + } + + if 
(m - tag_m_16x > 3) { + __m256i matrixArray256_0, matrixArray256_1, matrixArray256_2, matrixArray256_3, \ + matrixArray256_4, matrixArray256_5, matrixArray256_6, matrixArray256_7; + __m256i xArray256_0, xArray256_1, xArray256_2, xArray256_3; + __m256 accum256_0, accum256_1; + + xArray256_0 = _mm512_castsi512_si256(xArray_0); + xArray256_1 = _mm512_castsi512_si256(xArray_1); + xArray256_2 = _mm512_castsi512_si256(xArray_2); + xArray256_3 = _mm512_castsi512_si256(xArray_3); + + accum256_0 = _mm256_setzero_ps(); + accum256_1 = _mm256_setzero_ps(); + + matrixArray_0 = _mm512_loadu_si512(&a[(tag_m_16x )*16]); // Load 2 rows with n=16 + matrixArray_1 = _mm512_loadu_si512(&a[(tag_m_16x+2 )*16]); // Load 2 rows with n=16 + + matrixArray256_0 = _mm512_castsi512_si256(matrixArray_0); + matrixArray256_1 = _mm512_extracti32x8_epi32(matrixArray_0, 0x1); + matrixArray256_2 = _mm512_castsi512_si256(matrixArray_1); + matrixArray256_3 = _mm512_extracti32x8_epi32(matrixArray_1, 0x1); + + // 2-step interleave for matrix + BF16_INTERLEAVE_4x16(matrixArray256) + + // Calculate the temp result for a..d[0:15] + BF16_2STEP_INTERLEAVED_DOT_4x16(accum256, matrixArray256, xArray256) + + accum256_0 = _mm256_add_ps(accum256_0, accum256_1); + __m128 result128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + STORE4_COMPLETE_RESULT(result128, y+tag_m_16x) + tag_m_16x += 4; + } + } + + if (tag_m_16x != m) { + __m256i matrixArray256; + __m256 accum256; + __m128 accum128, tmp128; + for (BLASLONG i = tag_m_16x; i < m; i++) { + accum256 = _mm256_setzero_ps(); + matrixArray256 = _mm256_loadu_si256(&a[(i)*16]); // Load 1 rows with n=16 + accum256 = _mm256_dpbf16_ps(accum256, (__m256bh) matrixArray256, (__m256bh) x256); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01); + 
accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[i] = alpha * accum128[0] + beta * y[i]; +#else + y[i] = alpha * accum128[0] + y[i]; +#endif +#else +#ifndef ONE_ALPHA + y[i] = accum128[0] * alpha; +#else + y[i] = accum128[0]; +#endif +#endif + } + } + + return 0; +}
+
+/* BF16 GEMV kernel for rows with stride lda (row i starts at a[i*lda]) whose
+ * n elements fit in one 512-bit register; the original header gives the
+ * intended range as n > 16.
+ *
+ * One of four function names is selected by the ZERO_BETA / ONE_BETA /
+ * ONE_ALPHA build macros; the ZERO_BETA variants take no beta parameter.
+ *
+ * load_mask enables the low n 16-bit lanes. It is built as
+ * 0xffffffff >> (32-n), so the shift count is only in range for
+ * 1 <= n <= 32; nothing in this function checks that.
+ *
+ * Result per row i, as spelled out by the one-row tail loop at the bottom:
+ *   default     y[i] = alpha * dot(a[i*lda .. i*lda+n-1], x[0..n-1]) + beta * y[i]
+ *   ONE_BETA    y[i] = alpha * dot + y[i]
+ *   ZERO_BETA   y[i] = dot * alpha   (dot alone when ONE_ALPHA is also defined)
+ *
+ * Rows are consumed 8 at a time, then at most one group of 4 (nested under
+ * tag_m_8x > 0, so it only runs when m >= 8), then one row at a time.
+ *
+ * NOTE(review): the BF16_... and STORE..._COMPLETE_RESULT macros are defined
+ * outside this chunk. ALPHAVECTOR and BETAVECTOR are never named in this
+ * body, so the STORE macros presumably pick them up by name -- confirm
+ * before renaming or removing any local here.
+ *
+ * Always returns 0.
+ */
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16p_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16p_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16p_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    BLASLONG tag_m_8x = m & (~7);     // m rounded down to a multiple of 8; advanced below as the row cursor
+
+    unsigned int load_mask_value = (((unsigned int)0xffffffff) >> (32-n));     // low n bits set
+    __mmask32 load_mask = *((__mmask32*) &load_mask_value);
+    __m512i x512 = _mm512_maskz_loadu_epi16(load_mask, x);                  // |x0|x1|x2|x3|x4|x5|x6|x7|x8|x9|x10|x11|x12|x13|x14|x15|...
+
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_set1_ps(beta);
+#endif
+
+    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7, \
+            matrixArray_8, matrixArray_9, matrixArray_10, matrixArray_11, matrixArray_12, matrixArray_13, matrixArray_14, matrixArray_15;
+    __m512 accum512_0, accum512_1, accum512_2, accum512_3;
+    __m256 accum256;
+    __m128 accum128;
+
+    if (tag_m_8x > 0) {
+        __m512i xArray_0, xArray_1, xArray_2, xArray_3;
+
+        __m512i M512_EPI32_4 = _mm512_set1_epi32(4);
+        __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0);   // fp32 lanes 0-3 and 8-11 of each permutex2var source
+        __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4);                                   // same pattern shifted by 4: lanes 4-7 and 12-15
+
+        // Prepare X with 2-step interleave way
+        xArray_0 = x512;
+        BF16_INTERLEAVE_1x32(xArray)
+
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load 8 rows from matrix
+            BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, 0, load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_8x32(matrixArray)
+
+            // Calculate the temp result for a..h[0:31]
+            BF16_2STEP_INTERLEAVED_DOT_8x32(accum512, matrixArray, xArray)
+
+            // Reorder and add up the final result
+            accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
+            accum512_3 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
+            accum512_2 = _mm512_add_ps(accum512_2, accum512_3);
+            accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_2), _mm512_extractf32x8_ps(accum512_2, 1));   // low half + high half
+            STORE8_COMPLETE_RESULT(accum256, y+idx_m)
+        }
+
+        if (m - tag_m_8x > 3) {
+            accum512_0 = _mm512_setzero_ps();
+            accum512_1 = _mm512_setzero_ps();
+
+            // Load 4 rows from matrix
+            BF16_MATRIX_MASKZ_LOAD_4x32(matrixArray, a, lda, tag_m_8x, 0, load_mask)
+
+            // 2-step interleave for matrix
+            BF16_INTERLEAVE_4x32(matrixArray)
+
+            // Calculate the temp result for a..d[0:31]
+            BF16_2STEP_INTERLEAVED_DOT_4x32(accum512, matrixArray, xArray)
+
+            accum512_0 = _mm512_add_ps(accum512_0, accum512_1);
+            accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));   // 16 lanes -> 8
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));         // 8 lanes -> 4
+            STORE4_COMPLETE_RESULT(accum128, y+tag_m_8x)
+            tag_m_8x += 4;     // these 4 rows are done
+        }
+    }
+
+    if (tag_m_8x != m) {
+        __m128 tmp128;
+        for (BLASLONG i = tag_m_8x; i < m; i++) {
+            accum512_0 = _mm512_setzero_ps();
+            matrixArray_0 = _mm512_maskz_loadu_epi16(load_mask, &a[(i)*lda]);           // Load 1 row: the first n elements (masked) at row stride lda
+            accum512_0 = _mm512_dpbf16_ps(accum512_0, (__m512bh) matrixArray_0, (__m512bh) x512);
+            accum256 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));   // 16 partial sums -> 8
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256), _mm256_extractf32x4_ps(accum256, 1));         // 8 -> 4
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);                                                     // lanes 2,3 moved down to 0,1
+            accum128 = _mm_add_ps(accum128, tmp128);                                                               // 4 -> 2
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);                                                     // lane 1 moved down to 0
+            accum128 = _mm_add_ps(accum128, tmp128);                                                               // lane 0 now holds the full dot product
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+// 8 rows parallel processing BF16 GEMV kernel for big N && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_1x128_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_1x128_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_1x128_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x,
float *y) +#else +static int sbgemv_kernel_1x128_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + BLASLONG tag_n_128x = n & (~127); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m512 accum512_bridge[8]; + __m512 accum512_t_0, accum512_t_1, accum512_t_2, accum512_t_3; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3; + __m512i xArray_0, xArray_1, xArray_2, xArray_3; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + if (tag_m_8x > 0) { + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + for (int j = idx_m; j < idx_m + 8; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + 
BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + // Temply save the result into a ZMM + accum512_bridge[j-idx_m] = accum512_t_0; + } + + FP32_INTERLEAVE_8x16_ARRAY(accum512_bridge) + FP32_ACCUM2_8x16_ARRAY(accum512_bridge) + accum512_bridge[1] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_0, accum512_bridge[4]); + accum512_bridge[2] = _mm512_permutex2var_ps(accum512_bridge[0], idx_base_1, accum512_bridge[4]); + accum512_bridge[1] = _mm512_add_ps(accum512_bridge[1], accum512_bridge[2]); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_bridge[1]), _mm512_extractf32x8_ps(accum512_bridge[1], 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG j = tag_m_8x; j < m; j++) { + accum512_t_0 = _mm512_setzero_ps(); + accum512_t_1 = _mm512_setzero_ps(); + accum512_t_2 = _mm512_setzero_ps(); + 
accum512_t_3 = _mm512_setzero_ps(); + /* Processing the main chunk with 128-elements per round */ + for (long idx_n = 0; idx_n < tag_n_128x; idx_n += 128) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n + 0) + BF16_MATRIX_LOAD_1x32(matrixArray_1, a, lda, j, idx_n + 32) + BF16_MATRIX_LOAD_1x32(matrixArray_2, a, lda, j, idx_n + 64) + BF16_MATRIX_LOAD_1x32(matrixArray_3, a, lda, j, idx_n + 96) + + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n + 0) + BF16_VECTOR_LOAD_1x32(xArray_1, x, idx_n + 32) + BF16_VECTOR_LOAD_1x32(xArray_2, x, idx_n + 64) + BF16_VECTOR_LOAD_1x32(xArray_3, x, idx_n + 96) + + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + BF16_DOT_1x32(accum512_t_1, matrixArray_1, xArray_1) + BF16_DOT_1x32(accum512_t_2, matrixArray_2, xArray_2) + BF16_DOT_1x32(accum512_t_3, matrixArray_3, xArray_3) + } + + /* Processing the remaining <128 chunk with 32-elements per round */ + for (long idx_n = tag_n_128x; idx_n < tag_n_32x; idx_n += 32) { + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, j, idx_n) + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + BF16_DOT_1x32(accum512_t_0, matrixArray_0, xArray_0) + } + + /* Processing the remaining <32 chunk with masked 32-elements processing */ + if ((n&31) != 0) { + BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, j, tag_n_32x, tail_mask) + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + BF16_DOT_1x32(accum512_t_2, matrixArray_0, xArray_0) + } + + /* Accumulate the 4 registers into 1 register */ + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_1); + accum512_t_2 = _mm512_add_ps(accum512_t_2, accum512_t_3); + accum512_t_0 = _mm512_add_ps(accum512_t_0, accum512_t_2); + + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_t_0), _mm512_extractf32x8_ps(accum512_t_0, 1)); + accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1)); + tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e); + accum128 = _mm_add_ps(accum128, tmp128); + tmp128 = 
_mm_shuffle_ps(accum128, accum128, 0x01); + accum128 = _mm_add_ps(accum128, tmp128); +#ifndef ZERO_BETA +#ifndef ONE_BETA + y[j] = alpha * accum128[0] + beta * y[j]; +#else + y[j] = alpha * accum128[0] + y[j]; +#endif +#else +#ifndef ONE_ALPHA + y[j] = accum128[0] * alpha; +#else + y[j] = accum128[0]; +#endif +#endif + } + } + + return 0; +} + +// 8 rows parallel processing BF16 GEMV kernel for n=32 && lda effective scenario (process before interleave) +#ifndef ZERO_BETA +#ifndef ONE_BETA +static int sbgemv_kernel_8x32_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y) +#endif +#else +#ifndef ONE_ALPHA +static int sbgemv_kernel_8x32_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#else +static int sbgemv_kernel_8x32_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y) +#endif +#endif +{ + BLASLONG tag_m_8x = m & (~7); + BLASLONG tag_n_32x = n & (~31); + + __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \ + accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15; + __m256 accum256_0; + __m128 accum128; + +#ifndef ONE_ALPHA + __m512 ALPHAVECTOR = _mm512_set1_ps(alpha); +#endif +#ifndef ZERO_BETA + __m512 BETAVECTOR = _mm512_set1_ps(beta); +#endif + + __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7; + __m512i xArray_0; + + unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(n&31))); + __mmask32 tail_mask = *((__mmask32*) &tail_mask_value); + + if (tag_m_8x > 0) { + __m512i M512_EPI32_4 = _mm512_set1_epi32(4); + __m512i idx_base_0 = _mm512_set_epi32(27, 
26, 25, 24, 11, 10, 9, 8, 19, 18, 17, 16, 3, 2, 1, 0); + __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_4); + + for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) { + accum512_0 = _mm512_setzero_ps(); + accum512_1 = _mm512_setzero_ps(); + accum512_2 = _mm512_setzero_ps(); + accum512_3 = _mm512_setzero_ps(); + accum512_4 = _mm512_setzero_ps(); + accum512_5 = _mm512_setzero_ps(); + accum512_6 = _mm512_setzero_ps(); + accum512_7 = _mm512_setzero_ps(); + + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 8 rows from matrix + BF16_MATRIX_LOAD_8x32(matrixArray, a, lda, idx_m, idx_n) + + // Load x + BF16_VECTOR_LOAD_1x32(xArray_0, x, idx_n) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + if (tag_n_32x != n) { // Go with masked 512 + // Load 8 rows from matrix + BF16_MATRIX_MASKZ_LOAD_8x32(matrixArray, a, lda, idx_m, tag_n_32x, tail_mask) + + // Load x + BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask) + + // Calculate the temp result for a..h[0:31] + BF16_DOT_8x32(accum512, matrixArray, xArray_0) + } + + // 2-step interleave for FP32 regsiter array + FP32_INTERLEAVE_8x16(accum512) + + // Accumulate the 2 batch of registers into 2 register (0 and 4) + FP32_ACCUM2_8x16(accum512) + + accum512_1 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_4); + accum512_2 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_4); + accum512_1 = _mm512_add_ps(accum512_1, accum512_2); + accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_1), _mm512_extractf32x8_ps(accum512_1, 1)); + STORE8_COMPLETE_RESULT(accum256_0, y+idx_m) + } + } + + if (tag_m_8x != m) { + __m128 tmp128; + for (BLASLONG i = tag_m_8x; i < m; i++) { + accum512_0 = _mm512_setzero_ps(); + for (BLASLONG idx_n = 0; idx_n < tag_n_32x; idx_n+=32) { + // Load 32 elements from matrix + BF16_MATRIX_LOAD_1x32(matrixArray_0, a, lda, i, idx_n) + + // Load 32 elements from x + BF16_VECTOR_LOAD_1x32(xArray_0, x, 
idx_n)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            if (tag_n_32x != n) {
+                // Load tail elements from matrix
+                BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_0, a, lda, i, tag_n_32x, tail_mask)
+
+                // Load 32 elements from x
+                BF16_VECTOR_MASKZ_LOAD_1x32(xArray_0, x, tag_n_32x, tail_mask)
+
+                // Calculate and accumulate the temp result
+                BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
+            }
+
+            accum256_0 = _mm256_add_ps(_mm512_castps512_ps256(accum512_0), _mm512_extractf32x8_ps(accum512_0, 1));
+            accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+            accum128 = _mm_add_ps(accum128, tmp128);
+            tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+            accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+            y[i] = alpha * accum128[0] + beta * y[i];
+#else
+            y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+            y[i] = accum128[0] * alpha;
+#else
+            y[i] = accum128[0];
+#endif
+#endif
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * 8-rows-parallel BF16 GEMV kernel for the n <= 16 case, where consecutive
+ * rows of the matrix are lda elements apart (row i starts at a[i*lda]).
+ *
+ * The function is compiled under one of four names depending on the
+ * ZERO_BETA / ONE_BETA / ONE_ALPHA macros; the same macros select how each
+ * dot product is combined with y:
+ *   - neither ZERO_BETA nor ONE_BETA : y[i] = alpha * dot + beta * y[i]
+ *   - ONE_BETA (no ZERO_BETA)        : y[i] = alpha * dot + y[i]
+ *   - ZERO_BETA, no ONE_ALPHA        : y[i] = alpha * dot
+ *   - ZERO_BETA and ONE_ALPHA        : y[i] = dot
+ *
+ * Parameters:
+ *   m      number of rows of a, i.e. number of entries of y that are written
+ *   n      number of BF16 elements per row and in x; the load mask is built
+ *          with a shift by (16 - n), so n must not exceed 16
+ *   alpha  scalar applied to each dot product (see variants above)
+ *   a      BF16 matrix, row i at a[i*lda]
+ *   lda    distance in elements between the starts of consecutive rows
+ *   x      BF16 input vector, n elements
+ *   beta   scalar applied to the previous y (only in the non-ZERO_BETA variants)
+ *   y      FP32 result vector, m elements
+ *
+ * Returns 0.
+ *
+ * Rows are handled in groups of 8 first; the remaining (m % 8) rows are
+ * handled one by one. The full-group results are written through
+ * STORE8_COMPLETE_RESULT, which is defined elsewhere in this file
+ * (presumably applying the same alpha/beta variant via ALPHAVECTOR /
+ * BETAVECTOR -- confirm against the macro definition).
+ */
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+static int sbgemv_kernel_8x16m_lda_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
+#endif
+#else
+#ifndef ONE_ALPHA
+static int sbgemv_kernel_8x16m_lda_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#else
+static int sbgemv_kernel_8x16m_lda(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
+#endif
+#endif
+{
+    // Largest multiple of 8 that is <= m: rows [0, tag_m_8x) go through the 8-row path
+    BLASLONG tag_m_8x = m & (~7);
+
+    __m256i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
+    __m256i xArray256;
+
+    // Declared as 512-bit vectors to stay aligned with the other kernels and the shared
+    // macro definitions; only the low 256 bits are initialized, the high 256 bits are never used
+#ifndef ONE_ALPHA
+    __m512 ALPHAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(alpha));
+#endif
+#ifndef ZERO_BETA
+    __m512 BETAVECTOR = _mm512_castps256_ps512(_mm256_set1_ps(beta));
+#endif
+
+    __m256 accum256_0, accum256_1, accum256_2, accum256_3, accum256_4, accum256_5, accum256_6, accum256_7,
+           accum256_8, accum256_9, accum256_10, accum256_11, accum256_12, accum256_13, accum256_14, accum256_15;
+
+    // Index vectors for the final cross-register permutes of the reduction
+    __m256i M256_EPI32_4 = _mm256_set1_epi32(4);
+    __m256i idx_base_0 = _mm256_set_epi32(11, 10, 9, 8, 3, 2, 1, 0);
+    __m256i idx_base_1 = _mm256_add_epi32(idx_base_0, M256_EPI32_4);
+
+    // Mask with the low n bits set, used to zero-fill the unused lanes when n < 16
+    unsigned short load_mask_value = (((unsigned short)0xffff) >> (16-n));
+    __mmask16 load_mask = *((__mmask16*) &load_mask_value);
+
+    // x is loaded once and reused for every row
+    if (n == 16) {
+        BF16_VECTOR_LOAD_1x16(xArray256, x, 0)
+    } else {
+        BF16_VECTOR_MASKZ_LOAD_1x16(xArray256, x, 0, load_mask)
+    }
+
+    if (n == 16) {
+        // Full-width rows: unmasked loads
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            BF16_MATRIX_LOAD_8x16(matrixArray, a, lda, idx_m, 0)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+
+        if (tag_m_8x != m) {
+            __m128 accum128, tmp128;
+            for (BLASLONG i = tag_m_8x; i < m; i++) {
+                accum256_0 = _mm256_setzero_ps();
+                matrixArray_0 = _mm256_loadu_si256((const __m256i *) &a[(i)*lda]);                   // Load 1 row with n=16
+                accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
+                // Horizontal sum of the 8 partial results into element 0
+                accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                // Apply the alpha/beta variant exactly as the masked branch below does.
+                // (Previously this was an unconditional "y[i] += dot * alpha", which
+                // ignored beta and accumulated into y even in the ZERO_BETA variants.)
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    } else {
+        // Partial rows: masked loads zero-fill the lanes at and beyond n
+        for (BLASLONG idx_m = 0; idx_m < tag_m_8x; idx_m+=8) {
+            accum256_0 = _mm256_setzero_ps();
+            accum256_1 = _mm256_setzero_ps();
+            accum256_2 = _mm256_setzero_ps();
+            accum256_3 = _mm256_setzero_ps();
+            accum256_4 = _mm256_setzero_ps();
+            accum256_5 = _mm256_setzero_ps();
+            accum256_6 = _mm256_setzero_ps();
+            accum256_7 = _mm256_setzero_ps();
+
+            BF16_MATRIX_MASKZ_LOAD_8x16(matrixArray, a, lda, idx_m, 0, load_mask)
+
+            BF16_DOT_8x16(accum256, matrixArray, xArray256)
+
+            // 2-step interleave for FP32 register array
+            FP32_INTERLEAVE_8x8(accum256)
+
+            // Accumulate the 2 batch of registers into 2 register (0 and 4)
+            FP32_ACCUM2_8x8(accum256)
+
+            accum256_1 = _mm256_permutex2var_ps(accum256_0, idx_base_0, accum256_4);
+            accum256_2 = _mm256_permutex2var_ps(accum256_0, idx_base_1, accum256_4);
+            accum256_1 = _mm256_add_ps(accum256_1, accum256_2);
+
+            STORE8_COMPLETE_RESULT(accum256_1, y+idx_m)
+        }
+
+        if (tag_m_8x != m) {
+            __m128 accum128, tmp128;
+            for (BLASLONG i = tag_m_8x; i < m; i++) {
+                accum256_0 = _mm256_setzero_ps();
+                matrixArray_0 = _mm256_maskz_loadu_epi16(load_mask, &a[(i)*lda]);                    // Load 1 row with n<16 (masked)
+                accum256_0 = _mm256_dpbf16_ps(accum256_0, (__m256bh) matrixArray_0, (__m256bh) xArray256);
+                // Horizontal sum of the 8 partial results into element 0
+                accum128 = _mm_add_ps(_mm256_castps256_ps128(accum256_0), _mm256_extractf32x4_ps(accum256_0, 1));
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x0e);
+                accum128 = _mm_add_ps(accum128, tmp128);
+                tmp128 = _mm_shuffle_ps(accum128, accum128, 0x01);
+                accum128 = _mm_add_ps(accum128, tmp128);
+#ifndef ZERO_BETA
+#ifndef ONE_BETA
+                y[i] = alpha * accum128[0] + beta * y[i];
+#else
+                y[i] = alpha * accum128[0] + y[i];
+#endif
+#else
+#ifndef ONE_ALPHA
+                y[i] = accum128[0] * alpha;
+#else
+                y[i] = accum128[0];
+#endif
+#endif
+            }
+        }
+    }
+
+    return 0;
+}