Merge pull request #109 from xianyi/develop

rebase
This commit is contained in:
Martin Kroeker 2020-10-31 22:33:52 +01:00 committed by GitHub
commit 9efc3f0815
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
95 changed files with 6343 additions and 4147 deletions

View File

@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5)
project(OpenBLAS C ASM) project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 11.dev) set(OpenBLAS_PATCH_VERSION 12.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
# Adhere to GNU filesystem layout conventions # Adhere to GNU filesystem layout conventions

View File

@ -1,9 +1,36 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.3.12
24-Oct-2020
common:
* Fixed missing BLAS/LAPACK functions (inadvertently dropped during
the build system restructuring)
* Fixed argument conversion macro in LAPACKE_zgesvdq (LAPACK #458)
POWER:
* Added optimized SCOPY/CCOPY kernels for POWER10
* Increased and unified the default size of the GEMM BUFFER
* Fixed building for POWER10 in DYNAMIC_ARCH mode
* POWER10 compatibility test now checks binutils version as well
* Cleaned up compiler warnings
x86_64:
* Corrected compiler version checks for AVX2 compatibility
* Added compiler option -mavx2 for building with flang
* Fixed direct SGEMM pathway for small matrix sizes (broken by
the code refactoring in 0.3.11)
* Fixed unhandled partial register clobbers in several kernels
for AXPY, DOT, GEMV_N and GEMV_T flagged by the gcc10 tree-vectorizer
ARMV8:
* Improved Apple Vortex support to include cross-compiling
==================================================================== ====================================================================
Version 0.3.11 Version 0.3.11
17-Oct-2020 17-Oct-2020
common: common:
* API change: * API change:
the newly added BFLOAT16 functions were renamed to use the the newly added BFLOAT16 functions were renamed to use the
letter "B" instead of "H" to avoid potential confusion with letter "B" instead of "H" to avoid potential confusion with
@ -28,7 +55,7 @@ Version 0.3.11
* Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as * Makefile builds no longer misread NO_CBLAS=0 or NO_LAPACK=0 as
enabling these options enabling these options
* Fixed detection of gfortran when invoked through an mpi wrapper * Fixed detection of gfortran when invoked through an mpi wrapper
* Improve thread reinitialization performance with OpenMP xafter a fork * Improve thread reinitialization performance with OpenMP after a fork
* Added support for building only the subset of the library required * Added support for building only the subset of the library required
for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE for a particular precision by specifying BUILD_SINGLE, BUILD_DOUBLE
* Optional function name prefixes and suffixes are now correctly * Optional function name prefixes and suffixes are now correctly
@ -66,7 +93,6 @@ ARMV8:
* Fixed cpu detection on BSD-like systems * Fixed cpu detection on BSD-like systems
* Fixed compilation in -std=C18 mode * Fixed compilation in -std=C18 mode
IBM Z: IBM Z:
* Added support for compiling with the clang compiler * Added support for compiling with the clang compiler
* Improved GEMM performance on Z14 * Improved GEMM performance on Z14

View File

@ -12,3 +12,8 @@ ifeq ($(CORE), ARMV6)
CCOMMON_OPT += -mfpu=vfp CCOMMON_OPT += -mfpu=vfp
FCOMMON_OPT += -mfpu=vfp FCOMMON_OPT += -mfpu=vfp
endif endif
# When HAVE_NEON is defined, append -mfpu=neon to both CCOMMON_OPT and
# FCOMMON_OPT.
ifdef HAVE_NEON
CCOMMON_OPT += -mfpu=neon
FCOMMON_OPT += -mfpu=neon
endif

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.11.dev VERSION = 0.3.12.dev
# If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
# and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
@ -295,10 +295,13 @@ COMMON_PROF = -pg
# the below is not yet configurable, use cmake if you need to build only select types # By default the library contains BLAS functions (and LAPACK if selected) for all input types.
BUILD_SINGLE = 1 # To build a smaller library supporting e.g. only single precision real (SGEMM etc.) or only
BUILD_DOUBLE = 1 # the functions for complex numbers, uncomment the desired type(s) below
BUILD_COMPLEX = 1 # BUILD_SINGLE = 1
BUILD_COMPLEX16 = 1 # BUILD_DOUBLE = 1
# BUILD_COMPLEX = 1
# BUILD_COMPLEX16 = 1
#
# End of user configuration # End of user configuration
# #

View File

@ -319,6 +319,7 @@ ifeq ($(GCCVERSIONGTEQ7),1)
else else
GCCDUMPVERSION_PARAM := -dumpversion GCCDUMPVERSION_PARAM := -dumpversion
endif endif
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 1)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2) GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 2)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7) GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) $(GCCDUMPVERSION_PARAM) | cut -f2 -d.` \>= 7)
endif endif
@ -855,7 +856,7 @@ CCOMMON_OPT += -DF_INTERFACE_FLANG
FCOMMON_OPT += -Mrecursive -Kieee FCOMMON_OPT += -Mrecursive -Kieee
ifeq ($(OSNAME), Linux) ifeq ($(OSNAME), Linux)
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
FLANG_VENDOR := $(shell expr `$(FC) --version|cut -f 1 -d "."|head -1`) FLANG_VENDOR := $(shell `$(FC) --version|cut -f 1 -d "."|head -1`)
ifeq ($(FLANG_VENDOR),AOCC) ifeq ($(FLANG_VENDOR),AOCC)
FCOMMON_OPT += -fno-unroll-loops FCOMMON_OPT += -fno-unroll-loops
endif endif

View File

@ -47,8 +47,6 @@ ifndef DYNAMIC_ARCH
ifndef NO_AVX512 ifndef NO_AVX512
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# cooperlake support was added in 10.1 # cooperlake support was added in 10.1
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
GCCMINORVERSIONGTEQ1 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 1)
ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11) ifeq ($(GCCVERSIONGTEQ10)$(GCCMINORVERSIONGTEQ1), 11)
CCOMMON_OPT += -march=cooperlake CCOMMON_OPT += -march=cooperlake
FCOMMON_OPT += -march=cooperlake FCOMMON_OPT += -march=cooperlake
@ -73,10 +71,7 @@ ifndef DYNAMIC_ARCH
ifndef NO_AVX2 ifndef NO_AVX2
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0 # AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
CCOMMON_OPT += -mavx2 CCOMMON_OPT += -mavx2
endif endif

View File

@ -25,125 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AMAX #undef AMAX
#ifdef COMPLEX #ifdef COMPLEX
#ifdef DOUBLE #ifdef DOUBLE
#define AMAX BLASFUNC(dzamax) #define AMAX BLASFUNC(dzamax)
#else #else
#define AMAX BLASFUNC(scamax) #define AMAX BLASFUNC(scamax)
#endif #endif
#else #else
#ifdef DOUBLE #ifdef DOUBLE
#define AMAX BLASFUNC(damax) #define AMAX BLASFUNC(damax)
#else #else
#define AMAX BLASFUNC(samax) #define AMAX BLASFUNC(samax)
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__) int main(int argc, char *argv[])
{
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
blasint m, i; blasint m, i;
blasint inc_x=1; blasint inc_x = 1;
int loops = 1; int loops = 1;
int l; int l;
char *p; char *p;
int from = 1;
int to = 200;
int step = 1;
int from = 1; double time1, timeg;
int to = 200;
int step = 1;
struct timeval start, stop; argc--;
double time1,timeg; argv++;
argc--;argv++; if (argc > 0)
{
from = atol(*argv);
argc--;
argv++;
}
if (argc > 0)
{
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0)
{
step = atol(*argv);
argc--;
argv++;
}
if (argc > 0) { from = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_LOOPS")))
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} loops = atoi(p);
if (argc > 0) { step = atol(*argv); argc--; argv++;} if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
{
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ fprintf(stderr, "Out of Memory!!\n");
fprintf(stderr,"Out of Memory!!\n");exit(1); exit(1);
} }
#ifdef __linux #ifdef __linux
@ -152,37 +100,31 @@ int main(int argc, char *argv[]){
fprintf(stderr, " SIZE Flops\n"); fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) for (m = from; m <= to; m += step)
{ {
timeg=0; timeg = 0;
fprintf(stderr, " %6d : ", (int)m);
fprintf(stderr, " %6d : ", (int)m); for (l = 0; l < loops; l++)
{
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
{
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for (l=0; l<loops; l++) begin();
{ AMAX(&m, x, &inc_x);
end();
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ timeg += getsec();
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0);
AMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
} }
timeg /= loops; timeg /= loops;
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops %10.6f sec\n", " %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
} }
return 0; return 0;

View File

@ -25,124 +25,73 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AMIN #undef AMIN
#ifdef COMPLEX #ifdef COMPLEX
#ifdef DOUBLE #ifdef DOUBLE
#define AMIN BLASFUNC(dzamin) #define AMIN BLASFUNC(dzamin)
#else #else
#define AMIN BLASFUNC(scamin) #define AMIN BLASFUNC(scamin)
#endif #endif
#else #else
#ifdef DOUBLE #ifdef DOUBLE
#define AMIN BLASFUNC(damin) #define AMIN BLASFUNC(damin)
#else #else
#define AMIN BLASFUNC(samin) #define AMIN BLASFUNC(samin)
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__) int main(int argc, char *argv[])
{
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
blasint m, i; blasint m, i;
blasint inc_x=1; blasint inc_x = 1;
int loops = 1; int loops = 1;
int l; int l;
char *p; char *p;
int from = 1; int from = 1;
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop; double time1, timeg;
double time1,timeg;
argc--;argv++; argc--;
argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0)
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} {
if (argc > 0) { step = atol(*argv); argc--; argv++;} from = atol(*argv);
argc--;
argv++;
}
if (argc > 0)
{
to = MAX(atol(*argv), from);
argc--;
argv++;
}
if (argc > 0)
{
step = atol(*argv);
argc--;
argv++;
}
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); if ((p = getenv("OPENBLAS_LOOPS")))
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
fprintf(stderr,"Out of Memory!!\n");exit(1); {
fprintf(stderr, "Out of Memory!!\n");
exit(1);
} }
#ifdef __linux #ifdef __linux
@ -151,39 +100,35 @@ int main(int argc, char *argv[]){
fprintf(stderr, " SIZE Flops\n"); fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) for (m = from; m <= to; m += step)
{ {
timeg=0; timeg = 0;
fprintf(stderr, " %6d : ", (int)m); fprintf(stderr, " %6d : ", (int)m);
for (l = 0; l < loops; l++)
{
for (l=0; l<loops; l++) for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
{ {
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){ begin();
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
gettimeofday( &start, (struct timezone *)0); AMIN(&m, x, &inc_x);
AMIN (&m, x, &inc_x); end();
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
timeg += getsec();
} }
timeg /= loops; timeg /= loops;
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops %10.6f sec\n", " %10.2f MFlops %10.6f sec\n",
COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg); COMPSIZE * sizeof(FLOAT) * 1. * (double)m / timeg * 1.e-6, timeg);
} }
return 0; return 0;

View File

@ -25,132 +25,74 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ASUM #undef ASUM
#ifdef COMPLEX #ifdef COMPLEX
#ifdef DOUBLE #ifdef DOUBLE
#define ASUM BLASFUNC(dzasum) #define ASUM BLASFUNC(dzasum)
#else #else
#define ASUM BLASFUNC(scasum) #define ASUM BLASFUNC(scasum)
#endif #endif
#else #else
#ifdef DOUBLE #ifdef DOUBLE
#define ASUM BLASFUNC(dasum) #define ASUM BLASFUNC(dasum)
#else #else
#define ASUM BLASFUNC(sasum) #define ASUM BLASFUNC(sasum)
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__) int main(int argc, char *argv[])
{
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
FLOAT result; FLOAT result;
blasint m, i; blasint m, i;
blasint inc_x=1; blasint inc_x = 1;
int loops = 1; int loops = 1;
int l; int l;
char *p; char *p;
int from = 1; int from = 1;
int to = 200; int to = 200;
int step = 1; int step = 1;
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
double time1,timeg;
#else
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg; double time1, timeg;
#endif
argc--;argv++; argc--;
argv++;
if (argc > 0) { from = atol(*argv); argc--; argv++;} if (argc > 0)
if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} {
if (argc > 0) { step = atol(*argv); argc--; argv++;} from = atol(*argv);
argc--;
if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); argv++;
if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); }
if (argc > 0)
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step,inc_x,loops); {
to = MAX(atol(*argv), from);
if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ argc--;
fprintf(stderr,"Out of Memory!!\n");exit(1); argv++;
}
if (argc > 0)
{
step = atol(*argv);
argc--;
argv++;
} }
if ((p = getenv("OPENBLAS_LOOPS")))
loops = atoi(p);
if ((p = getenv("OPENBLAS_INCX")))
inc_x = atoi(p);
fprintf(stderr, "From : %3d To : %3d Step = %3d Inc_x = %d Loops = %d\n", from, to, step, inc_x, loops);
if ((x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL)
{
fprintf(stderr, "Out of Memory!!\n");
exit(1);
}
#ifdef __linux #ifdef __linux
srandom(getpid()); srandom(getpid());
@ -158,45 +100,33 @@ int main(int argc, char *argv[]){
fprintf(stderr, " SIZE Flops\n"); fprintf(stderr, " SIZE Flops\n");
for(m = from; m <= to; m += step) for (m = from; m <= to; m += step)
{ {
timeg=0; timeg = 0;
fprintf(stderr, " %6d : ", (int)m); fprintf(stderr, " %6d : ", (int)m);
for (l=0; l<loops; l++) for (l = 0; l < loops; l++)
{ {
for(i = 0; i < m * COMPSIZE * abs(inc_x); i++){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
}
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
gettimeofday( &start, (struct timezone *)0);
#else
clock_gettime(CLOCK_REALTIME, &start);
#endif
result = ASUM (&m, x, &inc_x);
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
clock_gettime(CLOCK_REALTIME, &stop);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
#else
gettimeofday( &stop, (struct timezone *)0);
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9;
#endif
timeg += time1;
for (i = 0; i < m * COMPSIZE * abs(inc_x); i++)
{
x[i] = ((FLOAT)rand() / (FLOAT)RAND_MAX) - 0.5;
}
begin();
result = ASUM(&m, x, &inc_x);
end();
timeg += getsec();
} }
if (loops >1) if (loops > 1)
timeg /= loops; timeg /= loops;
#ifdef COMPLEX #ifdef COMPLEX
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 4. * (double)m / timeg * 1.e-6, timeg);
#else #else
fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg); fprintf(stderr, " %10.2f MFlops %10.6f sec\n", 2. * (double)m / timeg * 1.e-6, timeg);
#endif #endif
} }
return 0; return 0;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPBY #undef AXPBY
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -129,7 +58,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -176,16 +104,10 @@ int main(int argc, char *argv[]){
for (l=0; l<loops; l++) for (l=0; l<loops; l++)
{ {
gettimeofday( &start, (struct timezone *)0); begin();
AXPBY (&m, alpha, x, &inc_x, beta, y, &inc_y ); AXPBY (&m, alpha, x, &inc_x, beta, y, &inc_y );
end();
gettimeofday( &stop, (struct timezone *)0); timeg += getsec();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
} }
timeg /= loops; timeg /= loops;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef AXPY #undef AXPY
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -127,8 +56,6 @@ int main(int argc, char *argv[]){
int from = 1; int from = 1;
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timespec start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -175,13 +102,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
clock_gettime( CLOCK_REALTIME, &start); begin();
AXPY (&m, alpha, x, &inc_x, y, &inc_y ); AXPY (&m, alpha, x, &inc_x, y, &inc_y );
clock_gettime( CLOCK_REALTIME, &stop); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9; time1 = getsec();
timeg += time1; timeg += time1;

104
benchmark/bench.h Normal file
View File

@ -0,0 +1,104 @@
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#if defined(__WIN32__) || defined(__WIN64__)

/* Offset subtracted below to rebase a FILETIME value (already scaled to
 * microseconds) onto the Unix epoch. */
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif

/*
 * Minimal gettimeofday() replacement for Windows builds.
 *
 * tv : receives the current system time, split into whole seconds and
 *      remaining microseconds; nothing is written when tv is NULL.
 * tz : accepted for signature compatibility only and never read.
 *
 * Always returns 0.
 */
int gettimeofday(struct timeval *tv, void *tz){

  FILETIME ft;
  unsigned __int64 tmpres = 0;

  (void)tz;  /* no time zone information is produced by this shim */

  if (NULL != tv)
    {
      GetSystemTimeAsFileTime(&ft);

      /* Join the two 32-bit halves of the FILETIME into one 64-bit count. */
      tmpres |= ft.dwHighDateTime;
      tmpres <<= 32;
      tmpres |= ft.dwLowDateTime;

      /* converting file time to unix epoch */
      tmpres /= 10;  /* convert into microseconds */
      tmpres -= DELTA_EPOCH_IN_MICROSECS;

      tv->tv_sec  = (long)(tmpres / 1000000UL);
      tv->tv_usec = (long)(tmpres % 1000000UL);
    }

  return 0;
}

#endif
/*
 * Optional allocator that backs benchmark buffers with a SysV shared-memory
 * segment requested with SHM_HUGETLB.  The trailing "&& 0" in the condition
 * keeps this whole section compiled out.
 */
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0

#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif

/*
 * Allocate `size` bytes (rounded with HUGE_PAGESIZE) from a private
 * shared-memory segment and return the attached address.  Prints a message
 * and exits with status 1 if either shmget() or shmat() fails.
 */
static void *huge_malloc(BLASLONG size){

  int segment;
  void *mapped;

  segment = shmget(IPC_PRIVATE,
		   (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
		   SHM_HUGETLB | IPC_CREAT |0600);

  if (segment < 0) {
    printf( "Memory allocation failed(shmget).\n");
    exit(1);
  }

  mapped = shmat(segment, NULL, SHM_RND);

  if (-1 == (BLASLONG)mapped){
    printf( "Memory allocation failed(shmat).\n");
    exit(1);
  }

  /* Mark the segment for removal right after attaching to it. */
  shmctl(segment, IPC_RMID, 0);

  return mapped;
}

/* Route every malloc() in the including benchmark through huge_malloc(). */
#define malloc huge_malloc

#endif
/*
 * Shared stopwatch for the benchmark drivers.
 *
 *   begin()  - record the start timestamp in `start`
 *   end()    - record the stop timestamp in `stop`
 *   getsec() - return (stop - start) in seconds as a double
 *
 * On Windows, or when _POSIX_TIMERS is not defined, timestamps come from
 * gettimeofday() (microsecond fields).  Otherwise they come from
 * clock_gettime() (nanosecond fields).
 */
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
struct timeval start, stop;
#else
struct timespec start = { 0, 0 }, stop = { 0, 0 };

/* Interval timing should not be affected by wall-clock adjustments, so use
 * the monotonic clock whenever the platform headers offer it. */
#ifndef BENCH_CLOCK_ID
#ifdef CLOCK_MONOTONIC
#define BENCH_CLOCK_ID CLOCK_MONOTONIC
#else
#define BENCH_CLOCK_ID CLOCK_REALTIME
#endif
#endif

/* Read the benchmark clock into *ts; if the preferred clock is rejected at
 * run time, fall back to CLOCK_REALTIME so a timestamp is still produced. */
static void bench_read_clock(struct timespec *ts)
{
  if (clock_gettime(BENCH_CLOCK_ID, ts) != 0)
    clock_gettime(CLOCK_REALTIME, ts);
}
#endif

/* Elapsed time between the last begin() and the last end(), in seconds. */
double getsec(void)
{
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
#else
  return (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) * 1.e-9;
#endif
}

/* Start the stopwatch. */
void begin(void) {
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
  gettimeofday( &start, (struct timezone *)0);
#else
  bench_read_clock(&start);
#endif
}

/* Stop the stopwatch. */
void end(void) {
#if defined(__WIN32__) || defined(__WIN64__) || !defined(_POSIX_TIMERS)
  gettimeofday( &stop, (struct timezone *)0);
#else
  bench_read_clock(&stop);
#endif
}

View File

@ -36,12 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double); double fabs(double);
@ -71,41 +66,6 @@ double fabs(double);
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
static __inline double getmflops(int ratio, int m, double secs){ static __inline double getmflops(int ratio, int m, double secs){
double mm = (double)m; double mm = (double)m;
@ -145,7 +105,6 @@ int main(int argc, char *argv[]){
FLOAT maxerr; FLOAT maxerr;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -220,20 +179,19 @@ int main(int argc, char *argv[]){
SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
gettimeofday( &start, (struct timezone *)0); begin();
POTRF(uplo[uplos], &m, b, &m, &info); POTRF(uplo[uplos], &m, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info != 0) { if (info != 0) {
fprintf(stderr, "Info = %d\n", info); fprintf(stderr, "Info = %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
maxerr = 0.;
if (!(uplos & 1)) { if (!(uplos & 1)) {
for (j = 0; j < m; j++) { for (j = 0; j < m; j++) {

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef COPY #undef COPY
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -128,11 +57,9 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1 = 0.0, timeg = 0.0; double time1 = 0.0, timeg = 0.0;
long nanos = 0; long nanos = 0;
time_t seconds = 0; time_t seconds = 0;
struct timespec time_start = { 0, 0 }, time_end = { 0, 0 };
argc--;argv++; argc--;argv++;
@ -176,15 +103,10 @@ int main(int argc, char *argv[]){
for (l=0; l<loops; l++) for (l=0; l<loops; l++)
{ {
clock_gettime(CLOCK_REALTIME, &time_start); begin();
COPY (&m, x, &inc_x, y, &inc_y ); COPY (&m, x, &inc_x, y, &inc_y );
clock_gettime(CLOCK_REALTIME, &time_end); end();
timeg += getsec();
nanos = time_end.tv_nsec - time_start.tv_nsec;
seconds = time_end.tv_sec - time_start.tv_sec;
time1 = seconds + nanos / 1.e9;
timeg += time1;
} }
timeg /= loops; timeg /= loops;

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT #undef DOT
#ifdef DOUBLE #ifdef DOUBLE
#define DOT BLASFUNC(ddot) #define DOT BLASFUNC(ddot)
#else #else
#define DOT BLASFUNC(sdot) #define DOT BLASFUNC(sdot)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -122,7 +49,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -169,15 +95,12 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
result = DOT (&m, x, &inc_x, y, &inc_y ); result = DOT (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
timeg += getsec();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
} }

View File

@ -36,13 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEEV #undef GEEV
@ -74,71 +68,6 @@ extern void GEEV( char* jobvl, char* jobvr, blasint* n, FLOAT* a,
FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info ); FLOAT* vr, blasint* ldvr, FLOAT* work, blasint* lwork, FLOAT *rwork, blasint* info );
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork; FLOAT *a,*vl,*vr,*wi,*wr,*work,*rwork;
@ -154,7 +83,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -223,7 +151,7 @@ int main(int argc, char *argv[]){
for(m = from; m <= to; m += step){ for(m = from; m <= to; m += step){
fprintf(stderr, " %6d : ", (int)m); fprintf(stderr, " %6d : ", (int)m);
gettimeofday( &start, (struct timezone *)0); begin();
lwork = -1; lwork = -1;
#ifndef COMPLEX #ifndef COMPLEX
@ -239,14 +167,14 @@ int main(int argc, char *argv[]){
GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info); GEEV (&job, &jobr, &m, a, &m, wr, vl, &m, vr, &m, work, &lwork,rwork, &info);
#endif #endif
gettimeofday( &stop, (struct timezone *)0); end();
if (info) { if (info) {
fprintf(stderr, "failed to compute eigenvalues .. %d\n", info); fprintf(stderr, "failed to compute eigenvalues .. %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n", " %10.2f MFlops : %10.2f Sec : %d\n",

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMM #undef GEMM
@ -55,71 +49,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
IFLOAT *a, *b; IFLOAT *a, *b;
@ -139,7 +68,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1, timeg; double time1, timeg;
argc--;argv++; argc--;argv++;
@ -228,14 +156,14 @@ int main(int argc, char *argv[]){
ldc = m; ldc = m;
fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k); fprintf(stderr, " M=%4d, N=%4d, K=%4d : ", (int)m, (int)n, (int)k);
gettimeofday( &start, (struct timezone *)0); begin();
for (j=0; j<loops; j++) { for (j=0; j<loops; j++) {
GEMM (&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc); GEMM (&transa, &transb, &m, &n, &k, alpha, a, &lda, b, &ldb, beta, c, &ldc);
} }
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg = time1/loops; timeg = time1/loops;
fprintf(stderr, fprintf(stderr,

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMM #undef GEMM
@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b, *c; FLOAT *a, *b, *c;
@ -133,7 +62,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -187,16 +115,12 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); GEMM (&trans, &trans, &m, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
timeg += getsec();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1;
} }
timeg /= loops; timeg /= loops;

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GEMV #undef GEMV
@ -52,72 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -211,10 +139,10 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;
} }
@ -248,10 +176,10 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GER #undef GER
@ -49,72 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -131,7 +59,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -198,16 +125,13 @@ int main(int argc, char *argv[]){
for (l=0; l<loops; l++) for (l=0; l<loops; l++)
{ {
gettimeofday( &start, (struct timezone *)0); begin();
GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m); GER (&m, &n, alpha, x, &inc_x, y, &inc_y, a , &m);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += getsec();
timeg += time1;
} }
timeg /= loops; timeg /= loops;

View File

@ -36,12 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double); double fabs(double);
@ -66,71 +61,6 @@ double fabs(double);
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b; FLOAT *a, *b;
@ -142,7 +72,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -194,22 +123,18 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
GESV (&m, &m, a, &m, ipiv, b, &m, &info); GESV (&m, &m, a, &m, ipiv, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
time1 = getsec();
fprintf(stderr, fprintf(stderr,
"%10.2f MFlops %10.6f s\n", "%10.2f MFlops %10.6f s\n",
COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1); COMPSIZE * COMPSIZE * (2. / 3. * (double)m * (double)m * (double)m + 2. * (double)m * (double)m * (double)m ) / (time1) * 1.e-6 , time1);
} }
return 0; return 0;

View File

@ -36,12 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef GETRF #undef GETRF
#undef GETRI #undef GETRI
@ -72,71 +67,6 @@
extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info); extern void GETRI(blasint *m, FLOAT *a, blasint *lda, blasint *ipiv, FLOAT *work, blasint *lwork, blasint *info);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a,*work; FLOAT *a,*work;
@ -148,7 +78,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -205,21 +134,21 @@ int main(int argc, char *argv[]){
exit(1); exit(1);
} }
gettimeofday( &start, (struct timezone *)0); begin();
lwork = -1; lwork = -1;
GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info); GETRI(&m, a, &m, ipiv, wkopt, &lwork, &info);
lwork = (blasint)wkopt[0]; lwork = (blasint)wkopt[0];
GETRI(&m, a, &m, ipiv, work, &lwork, &info); GETRI(&m, a, &m, ipiv, work, &lwork, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info) { if (info) {
fprintf(stderr, "failed compute inverse matrix .. %d\n", info); fprintf(stderr, "failed compute inverse matrix .. %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops : %10.2f Sec : %d\n", " %10.2f MFlops : %10.2f Sec : %d\n",

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HBMV #undef HBMV
#ifdef DOUBLE #ifdef DOUBLE
#define HBMV BLASFUNC(zhbmv) #define HBMV BLASFUNC(zhbmv)
#else #else
#define HBMV BLASFUNC(chbmv) #define HBMV BLASFUNC(chbmv)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz) {
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size) {
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -125,7 +52,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -186,15 +112,13 @@ int main(int argc, char *argv[]){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); HBMV (&uplo, &m, &k, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; timeg += getsec();
timeg += time1;
} }

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMM #undef HEMM
@ -41,72 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HEMM BLASFUNC(chemm) #define HEMM BLASFUNC(chemm)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b, *c; FLOAT *a, *b, *c;
@ -126,7 +54,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -170,13 +97,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); HEMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HEMV #undef HEMV
#ifdef DOUBLE #ifdef DOUBLE
#define HEMV BLASFUNC(zhemv) #define HEMV BLASFUNC(zhemv)
#else #else
#define HEMV BLASFUNC(chemv) #define HEMV BLASFUNC(chemv)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -124,7 +51,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -182,13 +108,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); HEMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER #undef HER
#ifdef DOUBLE #ifdef DOUBLE
#define HER BLASFUNC(zher) #define HER BLASFUNC(zher)
#else #else
#define HER BLASFUNC(cher) #define HER BLASFUNC(cher)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x; FLOAT *a, *x;
@ -126,8 +53,6 @@ int main(int argc, char *argv[]){
int from = 1; int from = 1;
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -166,15 +91,13 @@ int main(int argc, char *argv[]){
x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
HER (&uplo, &m, alpha, x, &incx, a, &m ); HER (&uplo, &m, alpha, x, &incx, a, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER2 #undef HER2
#ifdef DOUBLE #ifdef DOUBLE
#define HER2 BLASFUNC(zher2) #define HER2 BLASFUNC(zher2)
#else #else
#define HER2 BLASFUNC(cher2) #define HER2 BLASFUNC(cher2)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -127,7 +54,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -169,16 +95,13 @@ int main(int argc, char *argv[]){
y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[ (long)j * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m ); HER2 (&uplo, &m, alpha, x, &inc, y, &inc, a, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
gettimeofday( &start, (struct timezone *)0);
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HER2K #undef HER2K
#ifdef DOUBLE #ifdef DOUBLE
@ -40,72 +34,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HER2K BLASFUNC(cher2k) #define HER2K BLASFUNC(cher2k)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b, *c; FLOAT *a, *b, *c;
@ -125,7 +53,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -169,13 +96,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); HER2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HERK #undef HERK
#ifdef DOUBLE #ifdef DOUBLE
#define HERK BLASFUNC(zherk) #define HERK BLASFUNC(zherk)
#else #else
#define HERK BLASFUNC(cherk) #define HERK BLASFUNC(cherk)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *c; FLOAT *a, *c;
@ -127,7 +54,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -167,18 +93,17 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); HERK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",
COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6); COMPSIZE * COMPSIZE * 1. * (double)m * (double)m * (double)m / time1 * 1.e-6);
} }
return 0; return 0;

View File

@ -25,89 +25,16 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef HPMV #undef HPMV
#ifdef DOUBLE #ifdef DOUBLE
#define HPMV BLASFUNC(zhpmv) #define HPMV BLASFUNC(zhpmv)
#else #else
#define HPMV BLASFUNC(chpmv) #define HPMV BLASFUNC(chpmv)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz) {
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size) {
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -124,7 +51,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -183,13 +109,13 @@ int main(int argc, char *argv[]){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); HPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IAMAX #undef IAMAX
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
IAMAX (&m, x, &inc_x); IAMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IAMIN #undef IAMIN
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
IAMIN (&m, x, &inc_x); IAMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IMAX #undef IMAX
@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
IMAX (&m, x, &inc_x); IMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef IMIN #undef IMIN
@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
IMIN (&m, x, &inc_x); IMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -36,12 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double); double fabs(double);
@ -72,71 +67,6 @@ double fabs(double);
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b; FLOAT *a, *b;
@ -151,7 +81,6 @@ int main(int argc, char *argv[]){
FLOAT maxerr; FLOAT maxerr;
struct timeval start, stop;
double time1, time2; double time1, time2;
argc--;argv++; argc--;argv++;
@ -198,31 +127,31 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
GETRF (&m, &m, a, &m, ipiv, &info); GETRF (&m, &m, a, &m, ipiv, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info) { if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info); fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
gettimeofday( &start, (struct timezone *)0); begin();
GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info); GETRS("N", &m, &unit, a, &m, ipiv, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info) { if (info) {
fprintf(stderr, "Matrix is not singular .. %d\n", info); fprintf(stderr, "Matrix is not singular .. %d\n", info);
exit(1); exit(1);
} }
time2 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time2 = getsec();
maxerr = 0.; maxerr = 0.;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NAMAX #undef NAMAX
@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
NAMAX (&m, x, &inc_x); NAMAX (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NAMIN #undef NAMIN
@ -43,71 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -121,7 +50,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -160,13 +88,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
NAMIN (&m, x, &inc_x); NAMIN (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef NRM2 #undef NRM2
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x; FLOAT *x;
@ -127,7 +56,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -166,13 +94,13 @@ int main(int argc, char *argv[]){
x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
NRM2 (&m, x, &inc_x); NRM2 (&m, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -36,12 +36,7 @@
/* or implied, of The University of Texas at Austin. */ /* or implied, of The University of Texas at Austin. */
/*********************************************************************/ /*********************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
double fabs(double); double fabs(double);
@ -86,37 +81,7 @@ double fabs(double);
// extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info); // extern void POTRI(char *uplo, blasint *m, FLOAT *a, blasint *lda, blasint *info);
// extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info); // extern void POTRS(char *uplo, blasint *m, blasint *n, FLOAT *a, blasint *lda, FLOAT *b, blasint *ldb, blasint *info);
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
@ -141,7 +106,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -217,18 +181,18 @@ int main(int argc, char *argv[]){
SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m); SYRK(uplo[uplos], trans[uplos], &m, &m, alpha, a, &m, beta, b, &m);
gettimeofday( &start, (struct timezone *)0); begin();
POTRF(uplo[uplos], &m, b, &m, &info); POTRF(uplo[uplos], &m, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info != 0) { if (info != 0) {
fprintf(stderr, "Potrf info = %d\n", info); fprintf(stderr, "Potrf info = %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6; flops = COMPSIZE * COMPSIZE * (1.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 1.0/6.0* (double)m) / time1 * 1.e-6;
if ( btest == 'S' ) if ( btest == 'S' )
@ -240,17 +204,17 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info); POTRS(uplo[uplos], &m, &m, b, &m, a, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info != 0) { if (info != 0) {
fprintf(stderr, "Potrs info = %d\n", info); fprintf(stderr, "Potrs info = %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6; flops = COMPSIZE * COMPSIZE * (2.0 * (double)m * (double)m *(double)m ) / time1 * 1.e-6;
} }
@ -258,18 +222,18 @@ int main(int argc, char *argv[]){
if ( btest == 'I' ) if ( btest == 'I' )
{ {
gettimeofday( &start, (struct timezone *)0); begin();
POTRI(uplo[uplos], &m, b, &m, &info); POTRI(uplo[uplos], &m, b, &m, &info);
gettimeofday( &stop, (struct timezone *)0); end();
if (info != 0) { if (info != 0) {
fprintf(stderr, "Potri info = %d\n", info); fprintf(stderr, "Potri info = %d\n", info);
exit(1); exit(1);
} }
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6; flops = COMPSIZE * COMPSIZE * (2.0/3.0 * (double)m * (double)m *(double)m +1.0/2.0* (double)m *(double)m + 5.0/6.0* (double)m) / time1 * 1.e-6;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ROT #undef ROT
@ -52,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -133,7 +63,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -179,13 +108,13 @@ int main(int argc, char *argv[]){
for (l=0; l<loops; l++) for (l=0; l<loops; l++)
{ {
gettimeofday( &start, (struct timezone *)0); begin();
ROT (&m, x, &inc_x, y, &inc_y, c, s); ROT (&m, x, &inc_x, y, &inc_y, c, s);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,12 +25,7 @@ LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF
THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef ROTM #undef ROTM
@ -40,72 +35,6 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ROTM BLASFUNC(srotm) #define ROTM BLASFUNC(srotm)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz)
{
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv) {
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =
shmget(IPC_PRIVATE, (size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT | 0600)) < 0) {
printf("Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf("Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
@ -122,7 +51,7 @@ int main(int argc, char *argv[])
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1, timeg; double time1, timeg;
argc--; argc--;
@ -188,14 +117,13 @@ int main(int argc, char *argv[])
} }
for (l = 0; l < loops; l++) { for (l = 0; l < loops; l++) {
gettimeofday(&start, (struct timezone *)0); begin();
ROTM(&m, x, &inc_x, y, &inc_y, param); ROTM(&m, x, &inc_x, y, &inc_y, param);
gettimeofday(&stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + time1 = getsec();
(double)((stop.tv_usec - start.tv_usec)) * 1.e-6;
timeg += time1; timeg += time1;
} }

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SCAL #undef SCAL
@ -49,71 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -128,7 +57,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -174,13 +102,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SCAL (&m, alpha, x, &inc_x); SCAL (&m, alpha, x, &inc_x);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,17 +25,10 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPMV #undef SPMV
#ifndef COMPLEX #ifndef COMPLEX
#ifdef DOUBLE #ifdef DOUBLE
@ -54,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -135,7 +63,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -193,13 +120,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y ); SPMV (&uplo, &m, alpha, a, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPR #undef SPR
@ -41,73 +35,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SPR BLASFUNC(sspr) #define SPR BLASFUNC(sspr)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a,*c; FLOAT *a,*c;
@ -129,7 +56,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -173,13 +99,13 @@ int main(int argc, char *argv[]){
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SPR (&uplo, &m, alpha, c, &inc_x, a); SPR (&uplo, &m, alpha, c, &inc_x, a);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SPR2 #undef SPR2
@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a,*b,*c; FLOAT *a,*b,*c;
@ -129,7 +58,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -182,13 +110,13 @@ int main(int argc, char *argv[]){
c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; c[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a); SPR2 (&uplo, &m, alpha, c, &inc_x, b, &inc_y, a);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SWAP #undef SWAP
@ -49,71 +44,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -128,7 +58,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -175,13 +104,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SWAP (&m, x, &inc_x, y, &inc_y ); SWAP (&m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYMM #undef SYMM
@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b, *c; FLOAT *a, *b, *c;
@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -181,13 +109,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); SYMM (&side, &uplo, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYMV #undef SYMV
@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x, *y; FLOAT *a, *x, *y;
@ -134,7 +63,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -192,13 +120,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); SYMV (&uplo, &m, alpha, a, &m, x, &inc_x, beta, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR #undef SYR
@ -42,72 +37,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x,*a; FLOAT *x,*a;
@ -124,7 +53,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -165,13 +93,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYR (&uplo, &m, alpha, x, &inc_x, a, &m ); SYR (&uplo, &m, alpha, x, &inc_x, a, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR2 #undef SYR2
@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYR2 BLASFUNC(ssyr2) #define SYR2 BLASFUNC(ssyr2)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y, *a; FLOAT *x, *y, *a;
@ -125,7 +53,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -174,13 +101,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m ); SYR2 (&uplo, &m, alpha, x, &inc_x, y, &inc_y, a, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYR2K #undef SYR2K
@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b, *c; FLOAT *a, *b, *c;
@ -137,7 +67,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -181,13 +110,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m ); SYR2K (&uplo, &trans, &m, &m, alpha, a, &m, b, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef SYRK #undef SYRK
@ -53,71 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *c; FLOAT *a, *c;
@ -137,7 +66,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -177,13 +105,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m ); SYRK (&uplo, &trans, &m, &m, alpha, a, &m, beta, c, &m );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops\n", " %10.2f MFlops\n",

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPMV #undef TPMV
@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
@ -112,7 +73,6 @@ int main(int argc, char *argv[])
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg; double time1, timeg;
argc--;argv++; argc--;argv++;
@ -153,11 +113,11 @@ int main(int argc, char *argv[])
} }
for (l = 0; l < loops; l++) { for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start); begin();
TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x); TPMV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TPSV #undef TPSV
@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
@ -112,7 +73,6 @@ int main(int argc, char *argv[])
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg; double time1, timeg;
argc--;argv++; argc--;argv++;
@ -153,11 +113,11 @@ int main(int argc, char *argv[])
} }
for (l = 0; l < loops; l++) { for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start); begin();
TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x); TPSV (&uplo, &trans, &diag, &n, a, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRMM #undef TRMM
@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b; FLOAT *a, *b;
@ -141,7 +71,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -180,13 +109,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); TRMM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
fprintf(stderr, fprintf(stderr,
" %10.2f MFlops %10.6f sec\n", " %10.2f MFlops %10.6f sec\n",

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRMV #undef TRMV
@ -52,40 +47,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size)
{
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1) {
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]) int main(int argc, char *argv[])
{ {
@ -112,7 +73,6 @@ int main(int argc, char *argv[])
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timespec start = { 0, 0 }, stop = { 0, 0 };
double time1, timeg; double time1, timeg;
argc--;argv++; argc--;argv++;
@ -153,11 +113,11 @@ int main(int argc, char *argv[])
} }
for (l = 0; l < loops; l++) { for (l = 0; l < loops; l++) {
clock_gettime(CLOCK_REALTIME, &start); begin();
TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x); TRMV (&uplo, &trans, &diag, &n, a, &n, x, &inc_x);
clock_gettime(CLOCK_REALTIME, &stop); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_nsec - start.tv_nsec)) / 1.e9; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,12 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef TRSM #undef TRSM
@ -53,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *b; FLOAT *a, *b;
@ -151,7 +81,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1; double time1;
argc--;argv++; argc--;argv++;
@ -196,13 +125,13 @@ int main(int argc, char *argv[]){
} }
} }
gettimeofday( &start, (struct timezone *)0); begin();
TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m); TRSM (&side, &uplo, &trans, &diag, &m, &m, alpha, a, &m, b, &m);
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;
} }

View File

@ -25,14 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include <time.h>
#include "common.h"
#undef GEMV #undef GEMV
#undef TRSV #undef TRSV
@ -55,71 +48,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *a, *x; FLOAT *a, *x;
@ -133,7 +61,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timespec time_start, time_end;
time_t seconds = 0; time_t seconds = 0;
double time1,timeg; double time1,timeg;
@ -189,19 +116,13 @@ int main(int argc, char *argv[]){
for(l =0;l< loops;l++){ for(l =0;l< loops;l++){
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_start); begin();
TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x); TRSV(&uplo,&transa,&diag,&n,a,&n,x,&inc_x);
end();
clock_gettime(CLOCK_PROCESS_CPUTIME_ID,&time_end); time1 = getsec();
nanos = time_end.tv_nsec - time_start.tv_nsec;
seconds = time_end.tv_sec - time_start.tv_sec;
time1 = seconds + nanos /1.e9;
timeg += time1; timeg += time1;
} }
timeg /= loops; timeg /= loops;
long long muls = n*(n+1)/2.0; long long muls = n*(n+1)/2.0;
long long adds = (n - 1.0)*n/2.0; long long adds = (n - 1.0)*n/2.0;

View File

@ -25,90 +25,18 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#define RETURN_BY_STACK 1
#include "common.h"
#define RETURN_BY_STACK 1
#undef DOT #undef DOT
#ifdef DOUBLE #ifdef DOUBLE
#define DOT BLASFUNC(zdotu) #define DOT BLASFUNC(zdotu)
#else #else
#define DOT BLASFUNC(cdotu) #define DOT BLASFUNC(cdotu)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -123,7 +51,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -170,13 +97,13 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
DOT (&result, &m, x, &inc_x, y, &inc_y ); DOT (&result, &m, x, &inc_x, y, &inc_y );
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -25,13 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#include <stdio.h> #include "bench.h"
#include <stdlib.h>
#ifdef __CYGWIN32__
#include <sys/time.h>
#endif
#include "common.h"
#undef DOT #undef DOT
@ -42,72 +36,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define DOT BLASFUNC(cdotu) #define DOT BLASFUNC(cdotu)
#endif #endif
#if defined(__WIN32__) || defined(__WIN64__)
#ifndef DELTA_EPOCH_IN_MICROSECS
#define DELTA_EPOCH_IN_MICROSECS 11644473600000000ULL
#endif
int gettimeofday(struct timeval *tv, void *tz){
FILETIME ft;
unsigned __int64 tmpres = 0;
static int tzflag;
if (NULL != tv)
{
GetSystemTimeAsFileTime(&ft);
tmpres |= ft.dwHighDateTime;
tmpres <<= 32;
tmpres |= ft.dwLowDateTime;
/*converting file time to unix epoch*/
tmpres /= 10; /*convert into microseconds*/
tmpres -= DELTA_EPOCH_IN_MICROSECS;
tv->tv_sec = (long)(tmpres / 1000000UL);
tv->tv_usec = (long)(tmpres % 1000000UL);
}
return 0;
}
#endif
#if !defined(__WIN32__) && !defined(__WIN64__) && !defined(__CYGWIN32__) && 0
static void *huge_malloc(BLASLONG size){
int shmid;
void *address;
#ifndef SHM_HUGETLB
#define SHM_HUGETLB 04000
#endif
if ((shmid =shmget(IPC_PRIVATE,
(size + HUGE_PAGESIZE) & ~(HUGE_PAGESIZE - 1),
SHM_HUGETLB | IPC_CREAT |0600)) < 0) {
printf( "Memory allocation failed(shmget).\n");
exit(1);
}
address = shmat(shmid, NULL, SHM_RND);
if ((BLASLONG)address == -1){
printf( "Memory allocation failed(shmat).\n");
exit(1);
}
shmctl(shmid, IPC_RMID, 0);
return address;
}
#define malloc huge_malloc
#endif
int main(int argc, char *argv[]){ int main(int argc, char *argv[]){
FLOAT *x, *y; FLOAT *x, *y;
@ -122,7 +50,6 @@ int main(int argc, char *argv[]){
int to = 200; int to = 200;
int step = 1; int step = 1;
struct timeval start, stop;
double time1,timeg; double time1,timeg;
argc--;argv++; argc--;argv++;
@ -169,15 +96,15 @@ int main(int argc, char *argv[]){
for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){ for(i = 0; i < m * COMPSIZE * abs(inc_y); i++){
y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5;
} }
gettimeofday( &start, (struct timezone *)0); begin();
#ifdef RETURN_BY_STACK #ifdef RETURN_BY_STACK
DOT (&result , &m, x, &inc_x, y, &inc_y ); DOT (&result , &m, x, &inc_x, y, &inc_y );
#else #else
result = DOT (&m, x, &inc_x, y, &inc_y ); result = DOT (&m, x, &inc_x, y, &inc_y );
#endif #endif
gettimeofday( &stop, (struct timezone *)0); end();
time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; time1 = getsec();
timeg += time1; timeg += time1;

View File

@ -393,6 +393,7 @@ void cblas_sbf16tos(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPE
void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout); void cblas_dbf16tod(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *in, OPENBLAS_CONST blasint incin, double *out, OPENBLAS_CONST blasint incout);
/* dot production of BFLOAT16 input arrays, and output as float */ /* dot production of BFLOAT16 input arrays, and output as float */
float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy); float cblas_sbdot(OPENBLAS_CONST blasint n, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST bfloat16 *y, OPENBLAS_CONST blasint incy);
void cblas_sbgemv(OPENBLAS_CONST enum CBLAS_ORDER order, OPENBLAS_CONST enum CBLAS_TRANSPOSE trans, OPENBLAS_CONST blasint m, OPENBLAS_CONST blasint n, OPENBLAS_CONST float alpha, OPENBLAS_CONST bfloat16 *a, OPENBLAS_CONST blasint lda, OPENBLAS_CONST bfloat16 *x, OPENBLAS_CONST blasint incx, OPENBLAS_CONST float beta, float *y, OPENBLAS_CONST blasint incy);
#ifdef __cplusplus #ifdef __cplusplus
} }

View File

@ -184,8 +184,8 @@ macro(SetDefaultL2)
set(XHEMV_V_KERNEL ../generic/zhemv_k.c) set(XHEMV_V_KERNEL ../generic/zhemv_k.c)
set(XHEMV_M_KERNEL ../generic/zhemv_k.c) set(XHEMV_M_KERNEL ../generic/zhemv_k.c)
if (BUILD_BFLOAT16) if (BUILD_BFLOAT16)
set(SBGEMVNKERNEL ../arm/gemv_n.c) set(SBGEMVNKERNEL ../x86_64/sbgemv_n.c)
set(SBGEMVTKERNEL ../arm/gemv_t.c) set(SBGEMVTKERNEL ../x86_64/sbgemv_t.c)
set(SHGERKERNEL ../generic/ger.c) set(SHGERKERNEL ../generic/ger.c)
endif () endif ()
endmacro () endmacro ()

View File

@ -250,6 +250,8 @@ void BLASFUNC(xgeru)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *, void BLASFUNC(xgerc)(blasint *, blasint *, xdouble *, xdouble *, blasint *,
xdouble *, blasint *, xdouble *, blasint *); xdouble *, blasint *, xdouble *, blasint *);
void BLASFUNC(sbgemv)(char *, blasint *, blasint *, float *, bfloat16 *, blasint *,
bfloat16 *, blasint *, float *, float *, blasint *);
void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *, void BLASFUNC(sgemv)(char *, blasint *, blasint *, float *, float *, blasint *,
float *, blasint *, float *, float *, blasint *); float *, blasint *, float *, float *, blasint *);
void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *, void BLASFUNC(dgemv)(char *, blasint *, blasint *, double *, double *, blasint *,

View File

@ -44,6 +44,10 @@
extern "C" { extern "C" {
#endif #endif
int sbgemv_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
int sbgemv_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
int sbgemv_thread_n(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
int sbgemv_thread_t(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG, int);
int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int sger_k (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *); int dger_k (BLASLONG, BLASLONG, BLASLONG, double, double *, BLASLONG, double *, BLASLONG, double *, BLASLONG, double *);
int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *); int qger_k (BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *, BLASLONG, xdouble *);

View File

@ -646,10 +646,12 @@
#elif defined(BFLOAT16) #elif defined(BFLOAT16)
#define D_TO_BF16_K SBDTOBF16_K #define D_TO_BF16_K SBDTOBF16_K
#define D_BF16_TO_K DBF16TOD_K #define D_BF16_TO_K DBF16TOD_K
#define S_TO_BF16_K SBSTOBF16_K #define S_TO_BF16_K SBSTOBF16_K
#define S_BF16_TO_K SBF16TOS_K #define S_BF16_TO_K SBF16TOS_K
#define SBGEMV_N SBGEMV_N_K
#define SBGEMV_T SBGEMV_T_K
#define AMAX_K SAMAX_K #define AMAX_K SAMAX_K
#define AMIN_K SAMIN_K #define AMIN_K SAMIN_K

View File

@ -78,8 +78,8 @@ BLASLONG (*isbmin_k) (BLASLONG, float *, BLASLONG);
int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbscal_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG); int (*sbswap_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG);
int (*sbgemv_n) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_n) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
int (*sbgemv_t) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbgemv_t) (BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 *, BLASLONG, float, float *, BLASLONG);
int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbger_k) (BLASLONG, BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);
int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *); int (*sbsymv_L) (BLASLONG, BLASLONG, float, float *, BLASLONG, float *, BLASLONG, float *, BLASLONG, float *);

View File

@ -8,6 +8,8 @@
#define SBDTOBF16_K sbdtobf16_k #define SBDTOBF16_K sbdtobf16_k
#define SBF16TOS_K sbf16tos_k #define SBF16TOS_K sbf16tos_k
#define DBF16TOD_K dbf16tod_k #define DBF16TOD_K dbf16tod_k
#define SBGEMV_N_K sbgemv_n
#define SBGEMV_T_K sbgemv_t
#define SBGEMM_ONCOPY sbgemm_oncopy #define SBGEMM_ONCOPY sbgemm_oncopy
#define SBGEMM_OTCOPY sbgemm_otcopy #define SBGEMM_OTCOPY sbgemm_otcopy
@ -29,6 +31,8 @@
#define SBDTOBF16_K gotoblas -> sbdtobf16_k #define SBDTOBF16_K gotoblas -> sbdtobf16_k
#define SBF16TOS_K gotoblas -> sbf16tos_k #define SBF16TOS_K gotoblas -> sbf16tos_k
#define DBF16TOD_K gotoblas -> dbf16tod_k #define DBF16TOD_K gotoblas -> dbf16tod_k
#define SBGEMV_N_K gotoblas -> sbgemv_n
#define SBGEMV_T_K gotoblas -> sbgemv_t
#define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy #define SBGEMM_ONCOPY gotoblas -> sbgemm_oncopy
#define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy #define SBGEMM_OTCOPY gotoblas -> sbgemm_otcopy

View File

@ -202,7 +202,7 @@ int support_avx(){
if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){ if ((ecx & (1 << 28)) != 0 && (ecx & (1 << 27)) != 0 && (ecx & (1 << 26)) != 0){
xgetbv(0, &eax, &edx); xgetbv(0, &eax, &edx);
if((eax & 6) == 6){ if((eax & 6) == 6){
ret=1; //OS support AVX ret=1; //OS supports saving xmm and ymm registers (6 = (1<<1) | (1<<2))
} }
} }
return ret; return ret;
@ -219,8 +219,8 @@ int support_avx2(){
if (!support_avx()) if (!support_avx())
return 0; return 0;
cpuid(7, &eax, &ebx, &ecx, &edx); cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0) if((ebx & (1<<5)) != 0)
ret=1; //OS supports AVX2 ret=1; //CPU supports AVX2
return ret; return ret;
#else #else
return 0; return 0;
@ -235,14 +235,14 @@ int support_avx512(){
if (!support_avx()) if (!support_avx())
return 0; return 0;
cpuid(7, &eax, &ebx, &ecx, &edx); cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & 32) != 32){ if((ebx & (1<<5)) == 0){
ret=0; //OS does not even support AVX2 ret=0; //cpu does not have avx2 flag
} }
if((ebx & (1<<31)) != 0){ if((ebx & (1<<31)) != 0){ //AVX512VL flag
xgetbv(0, &eax, &edx); xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0) if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL ret=1; //OS supports saving zmm registers
} }
return ret; return ret;
#else #else
return 0; return 0;

View File

@ -413,7 +413,13 @@ XBLASOBJS += \
xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \ xtbmv_thread_RUU.$(SUFFIX) xtbmv_thread_RUN.$(SUFFIX) \
xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \ xtbmv_thread_RLU.$(SUFFIX) xtbmv_thread_RLN.$(SUFFIX) \
xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \ xtbmv_thread_CUU.$(SUFFIX) xtbmv_thread_CUN.$(SUFFIX) \
xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX) \ xtbmv_thread_CLU.$(SUFFIX) xtbmv_thread_CLN.$(SUFFIX)
ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += \
sbgemv_thread_n$(TSUFFIX).$(SUFFIX) \
sbgemv_thread_t$(TSUFFIX).$(SUFFIX)
endif
endif endif
@ -3693,4 +3699,12 @@ xtrsv_CUU.$(SUFFIX) xtrsv_CUU.$(PSUFFIX) : ztrsv_L.c ../../param.h
xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h xtrsv_CUN.$(SUFFIX) xtrsv_CUN.$(PSUFFIX) : ztrsv_L.c ../../param.h
$(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F) $(CC) -c $(CFLAGS) -DXDOUBLE -DCOMPLEX -DTRANSA=4 -UUNIT $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
sbgemv_thread_n.$(SUFFIX) sbgemv_thread_n.$(PSUFFIX) : sbgemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -UTRANSA -UCONJ -UXCONJ $< -o $(@F)
sbgemv_thread_t.$(SUFFIX) sbgemv_thread_t.$(PSUFFIX) : sbgemv_thread.c ../../common.h
$(CC) -c $(CFLAGS) -UCOMPLEX -UDOUBLE -DTRANSA -UCONJ -UXCONJ $< -o $(@F)
endif
include ../../Makefile.tail include ../../Makefile.tail

View File

@ -0,0 +1,149 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#ifndef TRANSA
#define SBGEMV SBGEMV_N
#else
#define SBGEMV SBGEMV_T
#endif
/*
 * Per-thread worker for the threaded SBGEMV driver.
 *
 * Unpacks the shared argument block, narrows it to the slice assigned to
 * this thread and calls the single-threaded SBGEMV kernel on that slice.
 *
 *   args    : shared arguments (a = matrix, b = x, c = y, lda, ldb = incx,
 *             ldc = incy, alpha / beta passed by pointer).
 *   range_m : [from, to) slice of the m dimension; read only when TRANSA
 *             is not defined.
 *   range_n : [from, to) slice of the n dimension; read only when TRANSA
 *             is defined.
 *   dummy1, dummy2, dummy3 : unused.
 *
 * Always returns 0.
 */
static int sbgemv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *dummy1, FLOAT *dummy2, BLASLONG dummy3){

  bfloat16 *mat = (bfloat16 *)args->a;
  bfloat16 *vec = (bfloat16 *)args->b;
  float    *out = (float *)args->c;

  BLASLONG lda  = args->lda;
  BLASLONG incx = args->ldb;
  BLASLONG incy = args->ldc;

  BLASLONG rows, cols;

#ifndef TRANSA  // N
  /* Slice along m: skip the first `start` elements of the matrix. */
  BLASLONG start = range_m[0];

  rows = range_m[1] - start;
  cols = args -> n;
  mat += start;
#else           // T
  /* Slice along n: skip the first `start` strides of length lda. */
  BLASLONG start = range_n[0];

  rows = args->m;
  cols = range_n[1] - start;
  mat += start * lda;
#endif

  /* In both variants the output slice begins `start` strided elements in. */
  out += start * incy;

  SBGEMV(rows, cols, *((FLOAT *)(args->alpha)), mat, lda, vec, incx, *((FLOAT *)(args->beta)), out, incy);

  return 0;
}
/*
 * Threaded SBGEMV driver.
 *
 * Splits one dimension of the problem (m when TRANSA is not defined, n when
 * it is) into `threads` contiguous slices, builds one queue entry per slice
 * that runs sbgemv_kernel, and hands the queue to exec_blas().
 *
 * Every slice has width (dimension / threads) except the last one, which
 * also receives the remainder. `threads` must be non-zero (it is used as a
 * divisor); the local queue and range arrays are sized by MAX_CPU_NUMBER.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float *y, BLASLONG incy, int threads)
{
  blas_arg_t   args;
  blas_queue_t queue[MAX_CPU_NUMBER];
  BLASLONG     range[MAX_CPU_NUMBER + 1];

#ifndef TRANSA
  BLASLONG remaining = m;
#else
  BLASLONG remaining = n;
#endif

  BLASLONG chunk = remaining / threads;
  int mode = BLAS_BFLOAT16 | BLAS_REAL;
  int t;

  /* Argument block shared by all workers. */
  args.a     = (void *)a;
  args.b     = (void *)x;
  args.c     = (void *)y;
  args.m     = m;
  args.n     = n;
  args.lda   = lda;
  args.ldb   = incx;
  args.ldc   = incy;
  args.alpha = (void *)&alpha;
  args.beta  = (void *)&beta;

  range[0] = 0;

  for (t = 0; t < threads; t++) {
    blas_queue_t *job = &queue[t];

    /* The last slice takes whatever is still left over. */
    range[t + 1] = range[t] + ((t == threads - 1) ? remaining : chunk);

    job->mode    = mode;
    job->routine = sbgemv_kernel;
    job->args    = &args;
#ifndef TRANSA
    job->range_m = &range[t];
    job->range_n = NULL;
#else
    job->range_m = NULL;
    job->range_n = &range[t];
#endif
    job->sa      = NULL;
    job->sb      = NULL;
    job->next    = job + 1;

    remaining -= chunk;
  }

  /* t is the number of queue entries built above. */
  if (t != 0) {
    queue[0].sa = NULL;
    queue[0].sb = NULL;
    queue[t - 1].next = NULL;  /* terminate the linked list of jobs */

    exec_blas(t, queue);
  }

  return 0;
}

View File

@ -352,7 +352,6 @@ fprintf(stderr,"UNHANDLED COMPLEX\n");
/* Other types in future */ /* Other types in future */
} }
} }
if (!sb) fprintf(stderr,"SB not declared!!!\n");
queue->sb=sb; queue->sb=sb;
} }
} }

View File

@ -330,8 +330,8 @@ int support_avx2(){
if (!support_avx()) if (!support_avx())
return 0; return 0;
cpuid(7, &eax, &ebx, &ecx, &edx); cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) != 0) if((ebx & (1<<5)) != 0)
ret=1; //OS supports AVX2 ret=1; //AVX2 flag is set
return ret; return ret;
#else #else
return 0; return 0;
@ -346,13 +346,13 @@ int support_avx512(){
if (!support_avx()) if (!support_avx())
return 0; return 0;
cpuid(7, &eax, &ebx, &ecx, &edx); cpuid(7, &eax, &ebx, &ecx, &edx);
if((ebx & (1<<7)) == 0){ if((ebx & (1<<5)) == 0){
ret=0; //OS does not even support AVX2 ret=0; //cpu does not have avx2 flag
} }
if((ebx & (1u<<31)) != 0){ if((ebx & (1<<31)) != 0){ //AVX512VL flag is set
xgetbv(0, &eax, &edx); xgetbv(0, &eax, &edx);
if((eax & 0xe0) == 0xe0) if((eax & 0xe0) == 0xe0)
ret=1; //OS supports AVX512VL ret=1; //OS supports saving zmm register
} }
return ret; return ret;
#else #else

View File

@ -139,19 +139,30 @@ static gotoblas_t *force_coretype(char *coretype) {
static gotoblas_t *get_coretype(void) { static gotoblas_t *get_coretype(void) {
int implementer, variant, part, arch, revision, midr_el1; int implementer, variant, part, arch, revision, midr_el1;
char coremsg[128];
#if (!defined OS_LINUX && !defined OS_ANDROID)
return NULL;
#endif
#if (defined OS_LINUX || defined OS_ANDROID)
if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) { if (!(getauxval(AT_HWCAP) & HWCAP_CPUID)) {
char coremsg[128]; #ifdef __linux
FILE *infile;
char buffer[512], *p, *cpu_part = NULL, *cpu_implementer = NULL;
p = (char *) NULL ;
infile = fopen("/sys/devices/system/cpu/cpu0/regs/identification/midr_el1","r");
if (!infile) return NULL;
fgets(buffer, sizeof(buffer), infile);
midr_el1=strtoul(buffer,NULL,16);
fclose(infile);
#else
snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n"); snprintf(coremsg, 128, "Kernel lacks cpuid feature support. Auto detection of core type failed !!!\n");
openblas_warning(1, coremsg); openblas_warning(1, coremsg);
return NULL; return NULL;
}
#else
return NULL;
#endif #endif
} else {
get_cpu_ftr(MIDR_EL1, midr_el1); get_cpu_ftr(MIDR_EL1, midr_el1);
}
/* /*
* MIDR_EL1 * MIDR_EL1
* *
@ -219,6 +230,9 @@ static gotoblas_t *get_coretype(void) {
return &gotoblas_FALKOR; return &gotoblas_FALKOR;
} }
break; break;
default:
snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
openblas_warning(1, coremsg);
} }
return NULL; return NULL;
} }

View File

@ -1767,11 +1767,11 @@ int get_num_procs(void);
int get_num_procs(void) { int get_num_procs(void) {
static int nums = 0; static int nums = 0;
#if defined(__GLIBC_PREREQ)
cpu_set_t cpuset,*cpusetp; cpu_set_t cpuset,*cpusetp;
size_t size; size_t size;
int ret; int ret;
#if defined(__GLIBC_PREREQ)
#if !__GLIBC_PREREQ(2, 7) #if !__GLIBC_PREREQ(2, 7)
int i; int i;
#if !__GLIBC_PREREQ(2, 6) #if !__GLIBC_PREREQ(2, 6)

View File

@ -51,7 +51,7 @@
zgeadd, dzsum); zgeadd, dzsum);
@blasobjs = (lsame, xerbla); @blasobjs = (lsame, xerbla);
@bfblasobjs = (sbgemm, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod); @bfblasobjs = (sbgemm, sbgemv, sbdot, sbstobf16, sbdtobf16, sbf16tos, dbf16tod);
@cblasobjsc = ( @cblasobjsc = (
cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv, cblas_caxpy, cblas_ccopy, cblas_cdotc, cblas_cdotu, cblas_cgbmv, cblas_cgemm, cblas_cgemv,
cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k, cblas_cgerc, cblas_cgeru, cblas_chbmv, cblas_chemm, cblas_chemv, cblas_cher2, cblas_cher2k,
@ -94,7 +94,7 @@
@cblasobjs = ( cblas_xerbla ); @cblasobjs = ( cblas_xerbla );
@bfcblasobjs = (cblas_sbgemm, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod); @bfcblasobjs = (cblas_sbgemm, cblas_sbgemv, cblas_sbdot, cblas_sbstobf16, cblas_sbdtobf16, cblas_sbf16tos, cblas_dbf16tod);
@exblasobjs = ( @exblasobjs = (
qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm, qamax,qamin,qasum,qaxpy,qcabs1,qcopy,qdot,qgbmv,qgemm,
@ -786,22 +786,22 @@ zpotri,
zlamswlq, zlamswlq,
zgemlq, zgemlq,
); );
@lapackobjs2 = (@lapackobjs2, @lapackobjs2s = (@lapackobjs2s,
sladiv1, sladiv1);
dladiv1, @lapackobjs2d = (@lapackobjs2d,
dladiv1);
@lapackobjs = (@lapackobjs,
iparam2stage, iparam2stage,
# functions added for lapack-3.8.0 # functions added for lapack-3.8.0
ilaenv2stage, ilaenv2stage,
); );
# functions added for lapack-3.9.0 # functions added for lapack-3.9.0
@lapackobjs2c = (@lapackobjs2c, @lapackobjs2c = (@lapackobjs2c,
cgesvdq, cgesvdq,
cungtsqr, cungtsqr
dcombssq,
); );
@lapackobjs2d = (@lapackobjs2d, @lapackobjs2d = (@lapackobjs2d,
dcombssq,
dgesvdq, dgesvdq,
dorgtsqr, dorgtsqr,
); );

View File

@ -1405,8 +1405,41 @@ int main(int argc, char *argv[]){
printf("NUM_CORES=%d\n", get_num_cores()); printf("NUM_CORES=%d\n", get_num_cores());
#if defined(__arm__) && !defined(FORCE) #if defined(__arm__)
#if !defined(FORCE)
fprintf(stderr,"get features!\n");
get_features(); get_features();
#else
fprintf(stderr,"split archconfig!\n");
sprintf(buffer, "%s", ARCHCONFIG);
p = &buffer[0];
while (*p) {
if ((*p == '-') && (*(p + 1) == 'D')) {
p += 2;
if (*p != 'H') {
while( (*p != ' ') && (*p != '-') && (*p != '\0') && (*p != '\n')) {p++; }
if (*p == '-') continue;
}
while ((*p != ' ') && (*p != '\0')) {
if (*p == '=') {
printf("=");
p ++;
while ((*p != ' ') && (*p != '\0')) {
printf("%c", *p);
p ++;
}
} else {
printf("%c", *p);
p ++;
if ((*p == ' ') || (*p =='\0')) printf("=1\n");
}
}
} else p ++;
}
#endif
#endif #endif

View File

@ -48,6 +48,7 @@ SBLAS3OBJS = \
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
SBBLAS1OBJS = sbdot.$(SUFFIX) SBBLAS1OBJS = sbdot.$(SUFFIX)
SBBLAS2OBJS = sbgemv.$(SUFFIX)
SBBLAS3OBJS = sbgemm.$(SUFFIX) SBBLAS3OBJS = sbgemm.$(SUFFIX)
SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX) SBEXTOBJS = sbstobf16.$(SUFFIX) sbdtobf16.$(SUFFIX) sbf16tos.$(SUFFIX) dbf16tod.$(SUFFIX)
endif endif
@ -284,6 +285,7 @@ CSBLAS3OBJS = \
ifeq ($(BUILD_BFLOAT16),1) ifeq ($(BUILD_BFLOAT16),1)
CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX) CSBBLAS1OBJS = cblas_sbdot.$(SUFFIX)
CSBBLAS2OBJS = cblas_sbgemv.$(SUFFIX)
CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX) CSBBLAS3OBJS = cblas_sbgemm.$(SUFFIX)
CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX) CSBEXTOBJS = cblas_sbstobf16.$(SUFFIX) cblas_sbdtobf16.$(SUFFIX) cblas_sbf16tos.$(SUFFIX) cblas_dbf16tod.$(SUFFIX)
endif endif
@ -382,6 +384,7 @@ SBLAS1OBJS += $(CSBLAS1OBJS)
SBLAS2OBJS += $(CSBLAS2OBJS) SBLAS2OBJS += $(CSBLAS2OBJS)
SBLAS3OBJS += $(CSBLAS3OBJS) SBLAS3OBJS += $(CSBLAS3OBJS)
SBBLAS1OBJS += $(CSBBLAS1OBJS) SBBLAS1OBJS += $(CSBBLAS1OBJS)
SBBLAS2OBJS += $(CSBBLAS2OBJS)
SBBLAS3OBJS += $(CSBBLAS3OBJS) SBBLAS3OBJS += $(CSBBLAS3OBJS)
DBLAS1OBJS += $(CDBLAS1OBJS) DBLAS1OBJS += $(CDBLAS1OBJS)
DBLAS2OBJS += $(CDBLAS2OBJS) DBLAS2OBJS += $(CDBLAS2OBJS)
@ -399,7 +402,7 @@ CBAUXOBJS += $(CXERBLAOBJ)
endif endif
SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS) SBLASOBJS = $(SBLAS1OBJS) $(SBLAS2OBJS) $(SBLAS3OBJS)
SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS3OBJS) SBBLASOBJS = $(SBBLAS1OBJS) $(SBBLAS2OBJS) $(SBBLAS3OBJS)
DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS) DBLASOBJS = $(DBLAS1OBJS) $(DBLAS2OBJS) $(DBLAS3OBJS)
QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS) QBLASOBJS = $(QBLAS1OBJS) $(QBLAS2OBJS) $(QBLAS3OBJS)
CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS) CBLASOBJS = $(CBLAS1OBJS) $(CBLAS2OBJS) $(CBLAS3OBJS)
@ -507,7 +510,7 @@ ifneq ($(BUILD_COMPLEX16),1)
endif endif
FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS) FUNCOBJS = $(SBEXTOBJS) $(CXERBLAOBJS) $(SBBLASOBJS) $(SBLASOBJS) $(DBLASOBJS) $(CBLASOBJS) $(ZBLASOBJS)
$(info FUNCOBJS = {[$(FUNCOBJS)]} )
ifdef EXPRECISION ifdef EXPRECISION
FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS) FUNCOBJS += $(QBLASOBJS) $(XBLASOBJS)
endif endif
@ -538,7 +541,7 @@ clean ::
level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS) level1 : $(SBEXTOBJS) $(SBBLAS1OBJS) $(SBLAS1OBJS) $(DBLAS1OBJS) $(QBLAS1OBJS) $(CBLAS1OBJS) $(ZBLAS1OBJS) $(XBLAS1OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level2 : $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS) level2 : $(SBBLAS2OBJS) $(SBLAS2OBJS) $(DBLAS2OBJS) $(QBLAS2OBJS) $(CBLAS2OBJS) $(ZBLAS2OBJS) $(XBLAS2OBJS)
$(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^ $(AR) $(ARFLAGS) -ru $(TOPDIR)/$(LIBNAME) $^
level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS) level3 : $(SBBLAS3OBJS) $(SBLAS3OBJS) $(DBLAS3OBJS) $(QBLAS3OBJS) $(CBLAS3OBJS) $(ZBLAS3OBJS) $(XBLAS3OBJS)
@ -929,6 +932,11 @@ xgeru.$(SUFFIX) xgeru.$(PSUFFIX) : zger.c
xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c xgerc.$(SUFFIX) xgerc.$(PSUFFIX) : zger.c
$(CC) -c $(CFLAGS) -DCONJ $< -o $(@F) $(CC) -c $(CFLAGS) -DCONJ $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
sbgemv.$(SUFFIX) sbgemv.$(PSUFFIX) : sbgemv.c
$(CC) $(CFLAGS) -c $< -o $(@F)
endif
ifndef USE_NETLIB_GEMV ifndef USE_NETLIB_GEMV
sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c sgemv.$(SUFFIX) sgemv.$(PSUFFIX): gemv.c
$(CC) -c $(CFLAGS) -o $(@F) $< $(CC) -c $(CFLAGS) -o $(@F) $<
@ -1656,6 +1664,11 @@ cblas_csscal.$(SUFFIX) cblas_csscal.$(PSUFFIX) : zscal.c
cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c cblas_zdscal.$(SUFFIX) cblas_zdscal.$(PSUFFIX) : zscal.c
$(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F) $(CC) $(CFLAGS) -DCBLAS -c -DSSCAL $< -o $(@F)
ifeq ($(BUILD_BFLOAT16),1)
cblas_sbgemv.$(SUFFIX) cblas_sbgemv.$(PSUFFIX) : sbgemv.c
$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
endif
cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c cblas_sgemv.$(SUFFIX) cblas_sgemv.$(PSUFFIX): gemv.c
$(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $< $(CC) -DCBLAS -c $(CFLAGS) -o $(@F) $<

View File

@ -191,7 +191,6 @@ void CNAME(enum CBLAS_ORDER order,
} }
#endif #endif
//printf("m=%d, n=%d, trans=%d, incx=%d, incy=%d, alpha=%f, beta=%f\n", m, n, trans, incx, incy, alpha, beta);
if ((m==0) || (n==0)) return; if ((m==0) || (n==0)) return;
lenx = n; lenx = n;

210
interface/sbgemv.c Normal file
View File

@ -0,0 +1,210 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include "common.h"
#include "l1param.h"
#ifdef FUNCTION_PROFILE
#include "functable.h"
#endif
#define ERROR_NAME "SBGEMV "
#ifdef SMP
/* Threaded SBGEMV drivers, indexed by the 0/1 transpose flag used in the */
/* interface routine of this file: [0] = sbgemv_thread_n,                 */
/* [1] = sbgemv_thread_t. The trailing int argument receives the thread   */
/* count.                                                                 */
static int (*sbgemv_thread[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG, int) = {
sbgemv_thread_n, sbgemv_thread_t,
};
#endif
/* SBGEMV interface: one shared body with two preprocessor-selected entry  */
/* points. Without CBLAS the Fortran-style NAME() is built (all arguments  */
/* passed by pointer); with CBLAS the CNAME() variant is built (arguments  */
/* by value plus order / transpose enums). Both validate their arguments,  */
/* report the first offending one through xerbla, and then dispatch to     */
/* either the single-threaded kernel or, under SMP, the threaded driver.   */
#ifndef CBLAS
void NAME(char *TRANS, blasint *M, blasint *N, float *ALPHA, bfloat16 *a, blasint *LDA, bfloat16 *x, blasint *INCX, float *BETA, float *y, blasint *INCY)
{
char trans = *TRANS;
blasint m = *M;
blasint n = *N;
blasint lda = *LDA;
blasint incx = *INCX;
blasint incy = *INCY;
float alpha = *ALPHA;
float beta = *BETA;
#ifdef SMP
int nthreads;
#endif
/* Single-threaded kernels, indexed by the 0/1 transpose flag. */
int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
SBGEMV_N, SBGEMV_T,
};
blasint info;
blasint lenx, leny;
blasint i;
PRINT_DEBUG_NAME;
TOUPPER(trans);
info = 0;
/* Map the transpose character to a kernel index: N/R -> 0, T/C -> 1; */
/* i stays -1 for any other character.                                */
i = -1;
if (trans == 'N') {i = 0;}
if (trans == 'T') {i = 1;}
if (trans == 'R') {i = 0;}
if (trans == 'C') {i = 1;}
/* Checks run from the last argument to the first, and later assignments */
/* overwrite earlier ones, so info ends up holding the position of the   */
/* lowest-numbered invalid argument.                                     */
if (incy == 0) {info = 11;}
if (incx == 0) {info = 8;}
if (lda < MAX(1, m)) {info = 6;}
if (n < 0) {info = 3;}
if (m < 0) {info = 2;}
if (i < 0) {info = 1;}
/* From here on trans holds the 0/1 kernel index, not the character. */
trans = i;
if (info != 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#else
void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint m, blasint n, float alpha, bfloat16 *a, blasint lda, bfloat16 *x, blasint incx, float beta, float *y, blasint incy)
{
blasint lenx, leny;
int trans;
blasint info, t;
#ifdef SMP
int nthreads;
#endif
/* Single-threaded kernels, indexed by the 0/1 transpose flag. */
int (*sbgemv[])(BLASLONG, BLASLONG, float, bfloat16 *, BLASLONG, bfloat16 * , BLASLONG, float, float *, BLASLONG) = {
SBGEMV_N, SBGEMV_T,
};
PRINT_DEBUG_CNAME;
trans = -1;
info = 0;
/* trans stays -1 when TransA matches none of the values tested below. */
if (order == CblasColMajor) { // Column Major
if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) {
trans = 0;
} else if (TransA == CblasTrans || TransA == CblasConjTrans) {
trans = 1;
}
} else { // Row Major
/* Any order other than CblasColMajor lands here: the transpose flag */
/* is inverted and m and n are swapped before validation.            */
if (TransA == CblasNoTrans || TransA == CblasConjNoTrans) {
trans = 1;
} else if (TransA == CblasTrans || TransA == CblasConjTrans) {
trans = 0;
}
t = n;
n = m;
m = t;
}
/* In this variant -1 means "no error"; as above, later checks overwrite */
/* earlier ones so the lowest-numbered invalid argument is reported.     */
info = -1;
if (incy == 0) {info = 11;}
if (incx == 0) {info = 8;}
if (lda < MAX(1, m)) {info = 6;}
if (n < 0) {info = 3;}
if (m < 0) {info = 2;}
if (trans < 0) {info = 1;}
if (info >= 0) {
BLASFUNC(xerbla)(ERROR_NAME, &info, sizeof(ERROR_NAME));
return;
}
#endif
/* Shared body: nothing to do for an empty matrix. */
if ((m==0) || (n==0)) return;
/* Logical lengths of x and y depend on the transpose flag. */
if (trans) {
lenx = m;
leny = n;
} else {
lenx = n;
leny = m;
}
/* With alpha == 0 only SCAL_K is applied to y with factor beta (skipped */
/* when beta == ONE); no kernel is called.                               */
if (alpha == ZERO) {
if (beta != ONE) SCAL_K(leny, 0, 0, beta, y, blasabs(incy), NULL, 0, NULL, 0);
return;
}
IDEBUG_START;
FUNCTION_PROFILE_START();
/* For a negative increment, move the base pointer by (len - 1) * |inc| */
/* elements before handing it to the kernel.                            */
if (incx < 0) {x -= (lenx - 1) * incx;}
if (incy < 0) {y -= (leny - 1) * incy;}
#ifdef SMP
/* Stay single-threaded unless the dimension tested below (n when trans */
/* is set, m otherwise) exceeds this threshold; otherwise the thread    */
/* count comes from num_cpu_avail(1).                                   */
int thread_thres_row = 20480;
if (trans) {
if (n <= thread_thres_row) {
nthreads = 1;
} else {
nthreads = num_cpu_avail(1);
}
} else {
if (m <= thread_thres_row) {
nthreads = 1;
} else {
nthreads = num_cpu_avail(1);
}
}
if (nthreads == 1) {
#endif
/* Single-threaded path (the only path when SMP is not defined). */
(sbgemv[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy);
#ifdef SMP
} else {
(sbgemv_thread[(int)trans])(m, n, alpha, a, lda, x, incx, beta, y, incy, nthreads);
}
#endif
FUNCTION_PROFILE_END(1, m * n + m + n, 2 * m * n);
IDEBUG_END;
return;
}

View File

@ -12,11 +12,6 @@ ifdef HAVE_SSSE3
CFLAGS += -mssse3 CFLAGS += -mssse3
endif endif
ifeq ($(C_COMPILER), GCC)
GCCVERSIONGTEQ9 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 9)
GCCVERSIONGTEQ10 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 10)
endif
ifeq ($(ARCH), power) ifeq ($(ARCH), power)
ifeq ($(C_COMPILER), CLANG) ifeq ($(C_COMPILER), CLANG)
override CFLAGS += -fno-integrated-as override CFLAGS += -fno-integrated-as
@ -26,20 +21,14 @@ endif
AVX2OPT = AVX2OPT =
ifeq ($(C_COMPILER), GCC) ifeq ($(C_COMPILER), GCC)
# AVX2 support was added in 4.7.0 # AVX2 support was added in 4.7.0
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ7 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 7)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2 AVX2OPT = -mavx2
endif endif
endif endif
ifeq ($(C_COMPILER), CLANG) ifeq ($(C_COMPILER), CLANG)
# Any clang posing as gcc 4.2 should be new enough (3.4 or later) # Any clang posing as gcc 4.2 should be new enough (3.4 or later)
GCCVERSIONGTEQ4 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 4) GCCVERSIONCHECK := $(GCCVERSIONGT4)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ2)
GCCVERSIONGTEQ5 := $(shell expr `$(CC) -dumpversion | cut -f1 -d.` \>= 5)
GCCMINORVERSIONGTEQ2 := $(shell expr `$(CC) -dumpversion | cut -f2 -d.` \>= 2)
GCCVERSIONCHECK := $(GCCVERSIONGTEQ5)$(GCCVERSIONGTEQ4)$(GCCMINORVERSIONGTEQ7)
ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111)) ifeq ($(GCCVERSIONCHECK), $(filter $(GCCVERSIONCHECK), 011 110 111))
AVX2OPT = -mavx2 AVX2OPT = -mavx2
endif endif

View File

@ -48,6 +48,16 @@ ifndef XGEMVTKERNEL
XGEMVTKERNEL = zgemv_t.S XGEMVTKERNEL = zgemv_t.S
endif endif
# bfloat16 GEMV kernel sources, only considered when BUILD_BFLOAT16 is 1.
# Each variable is assigned only if it has not been defined already; the
# defaults point at the C sources in the sibling x86_64 directory.
ifeq ($(BUILD_BFLOAT16),1)
ifndef SBGEMVNKERNEL
SBGEMVNKERNEL = ../x86_64/sbgemv_n.c
endif
ifndef SBGEMVTKERNEL
SBGEMVTKERNEL = ../x86_64/sbgemv_t.c
endif
endif
### GER ### ### GER ###
ifndef SGERKERNEL ifndef SGERKERNEL
@ -234,6 +244,12 @@ XBLASOBJS += \
xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \ xhemv_U$(TSUFFIX).$(SUFFIX) xhemv_L$(TSUFFIX).$(SUFFIX) xhemv_V$(TSUFFIX).$(SUFFIX) xhemv_M$(TSUFFIX).$(SUFFIX) \
xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX) xgeru_k$(TSUFFIX).$(SUFFIX) xgerc_k$(TSUFFIX).$(SUFFIX) xgerv_k$(TSUFFIX).$(SUFFIX) xgerd_k$(TSUFFIX).$(SUFFIX)
# When BUILD_BFLOAT16 is 1, add the two bfloat16 GEMV objects (non-transposed
# and transposed variants) to the SBBLASOBJS list.
ifeq ($(BUILD_BFLOAT16),1)
SBBLASOBJS += \
sbgemv_n$(TSUFFIX).$(SUFFIX) \
sbgemv_t$(TSUFFIX).$(SUFFIX)
endif
ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" "" ifneq "$(or $(BUILD_SINGLE), $(BUILD_DOUBLE), $(BUILD_COMPLEX))" ""
$(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP) $(KDIR)sgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SGEMVNKERNEL) $(TOPDIR)/common.h $(GEMVDEP)
$(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@ $(CC) -c $(CFLAGS) -UDOUBLE -UCOMPLEX -UTRANS $< -o $@
@ -483,4 +499,10 @@ $(KDIR)xhemv_V$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_V$(TSUFFIX).$(PSUFFIX) : $(KER
$(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h $(KDIR)xhemv_M$(TSUFFIX).$(SUFFIX) $(KDIR)xhemv_M$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(XHEMV_M_KERNEL) ../symcopy.h
$(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@ $(CC) -c $(CFLAGS) -DCOMPLEX -DXDOUBLE -DLOWER -DHEMV -DHEMVREV $< -o $@
# Build rules for the bfloat16 GEMV kernels (only when BUILD_BFLOAT16 is 1).
# Each rule produces both the regular object (.$(SUFFIX)) and the profiling
# object (.$(PSUFFIX)). The profiling target must carry the same $(TSUFFIX)
# as the regular one, exactly like every other kernel rule in this file;
# the previous $(TPSUFFIX) spelling does not match that pattern and would
# yield a target name without the suffix.
ifeq ($(BUILD_BFLOAT16),1)
$(KDIR)sbgemv_n$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_n$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVNKERNEL)
	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
$(KDIR)sbgemv_t$(TSUFFIX).$(SUFFIX) $(KDIR)sbgemv_t$(TSUFFIX).$(PSUFFIX) : $(KERNELDIR)/$(SBGEMVTKERNEL)
	$(CC) -c $(CFLAGS) -UCOMPLEX $< -o $@
endif

View File

@ -141,13 +141,9 @@ DASUMKERNEL = dasum.c
CASUMKERNEL = casum.c CASUMKERNEL = casum.c
ZASUMKERNEL = zasum.c ZASUMKERNEL = zasum.c
# #
SAXPYKERNEL = saxpy.c SAXPYKERNEL = saxpy_power10.c
DAXPYKERNEL = daxpy_power10.c DAXPYKERNEL = daxpy_power10.c
ifneq ($(GCCVERSIONGTEQ9),1) CAXPYKERNEL = caxpy_power10.c
CAXPYKERNEL = caxpy_power9.S
else
CAXPYKERNEL = caxpy.c
endif
ZAXPYKERNEL = zaxpy_power10.c ZAXPYKERNEL = zaxpy_power10.c
# #
SCOPYKERNEL = scopy_power10.c SCOPYKERNEL = scopy_power10.c

View File

@ -0,0 +1,188 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER10 inline-assembly kernel for single-precision complex AXPY on
 * unit-stride data: y += alpha * x, or y += alpha * conj(x) when CONJ is
 * defined.
 *
 *   n         number of complex elements. The prologue loads 128 bytes from
 *             both x and y before the counter is tested and every pass
 *             consumes 16 elements, so n must be a positive multiple of 16.
 *   x, y      arrays of interleaved (real, imag) floats; y is updated in place.
 *   alpha_r,
 *   alpha_i   real and imaginary parts of alpha.
 *
 * Register roles inside the asm:
 *   vs32 / vs33   alpha_r / alpha_i splatted to all four lanes; one of them is
 *                 pre-multiplied by the sign pattern mvec (alpha_i without
 *                 CONJ, alpha_r with CONJ)
 *   vs40-vs47     x as loaded
 *   vs52-vs59     x with the real and imaginary lanes of each element swapped
 *   vs48-vs51, vs34-vs35, vs38-vs39   y accumulators
 *
 * Each y vector receives two fused multiply-adds: x * vs32 and swapped(x) * vs33.
 * ytmp is a second y pointer used for the stores, so the y load pointer can run
 * one block ahead of it.
 */
static void caxpy_kernel_8 (long n, float *x, float *y,
                            float alpha_r, float alpha_i)
{
#if !defined(CONJ)
  static const float mvec[4] = { -1.0, 1.0, -1.0, 1.0 };
#else
  static const float mvec[4] = { 1.0, -1.0, 1.0, -1.0 };
#endif
  const float *mvecp = mvec;
  /* xxperm control vector that swaps the two 32-bit words of every 64-bit
     pair. A C vector initializer maps its elements to register bytes in the
     opposite order on little- and big-endian targets, so the initializer has
     to be reversed between the two; pick it by the compiler-reported byte
     order instead of hard-coding the little-endian form. */
#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
  __vector unsigned char mask = { 4,5,6,7,0,1,2,3,12,13,14,15,8,9,10,11};
#else
  __vector unsigned char mask = { 11,10,9,8,15,14,13,12,3,2,1,0,7,6,5,4};
#endif
  long ytmp;
  __asm__
    (
      /* Broadcast alpha_r / alpha_i and fold the sign pattern into one of them. */
      "xscvdpspn 32, %7 \n\t"
      "xscvdpspn 33, %8 \n\t"
      "xxspltw 32, 32, 0 \n\t"
      "xxspltw 33, 33, 0 \n\t"
      "lxvd2x 36, 0, %9 \n\t" // mvec
#if !defined(CONJ)
      "xvmulsp 33, 33, 36 \n\t" // alpha_i * mvec
#else
      "xvmulsp 32, 32, 36 \n\t" // alpha_r * mvec
#endif
      /* %4 shares a register with %9 (mvecp); it becomes the y store pointer
         only after mvec has been loaded above. */
      "mr %4, %3 \n\t"
      "dcbt 0, %2 \n\t"
      "dcbt 0, %3 \n\t"
      /* Prologue: load the first 16 elements of x and y and build swapped x. */
      "lxvp 40, 0(%2) \n\t" // x0
      "lxvp 42, 32(%2) \n\t" // x2
      "lxvp 48, 0(%3) \n\t" // y0
      "lxvp 50, 32(%3) \n\t" // y2
      "xxperm 52, 40, %x10 \n\t" // exchange real and imag part
      "xxperm 53, 41, %x10 \n\t" // exchange real and imag part
      "xxperm 54, 42, %x10 \n\t" // exchange real and imag part
      "xxperm 55, 43, %x10 \n\t" // exchange real and imag part
      "lxvp 44, 64(%2) \n\t" // x4
      "lxvp 46, 96(%2) \n\t" // x6
      "lxvp 34, 64(%3) \n\t" // y4
      "lxvp 38, 96(%3) \n\t" // y6
      "xxperm 56, 44, %x10 \n\t" // exchange real and imag part
      "xxperm 57, 45, %x10 \n\t" // exchange real and imag part
      "xxperm 58, 46, %x10 \n\t" // exchange real and imag part
      "xxperm 59, 47, %x10 \n\t" // exchange real and imag part
      "addi %2, %2, 128 \n\t"
      "addi %3, %3, 128 \n\t"
      "addic. %1, %1, -16 \n\t"
      "ble two%= \n\t"
      ".align 5 \n"
      /* Steady state: finish the current block while loading the next one. */
      "one%=: \n\t"
      "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
      "xvmaddasp 49, 41, 32 \n\t"
      "lxvp 40, 0(%2) \n\t" // x0
      "xvmaddasp 50, 42, 32 \n\t"
      "xvmaddasp 51, 43, 32 \n\t"
      "lxvp 42, 32(%2) \n\t" // x2
      "xvmaddasp 34, 44, 32 \n\t"
      "xvmaddasp 35, 45, 32 \n\t"
      "lxvp 44, 64(%2) \n\t" // x4
      "xvmaddasp 38, 46, 32 \n\t"
      "xvmaddasp 39, 47, 32 \n\t"
      "lxvp 46, 96(%2) \n\t" // x6
      "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
      "addi %2, %2, 128 \n\t"
      "xvmaddasp 49, 53, 33 \n\t"
      "xvmaddasp 50, 54, 33 \n\t"
      "xvmaddasp 51, 55, 33 \n\t"
      "xvmaddasp 34, 56, 33 \n\t"
      "xvmaddasp 35, 57, 33 \n\t"
      "xvmaddasp 38, 58, 33 \n\t"
      "xvmaddasp 39, 59, 33 \n\t"
      "stxvp 48, 0(%4) \n\t"
      "stxvp 50, 32(%4) \n\t"
      "stxvp 34, 64(%4) \n\t"
      "stxvp 38, 96(%4) \n\t"
      "addi %4, %4, 128 \n\t"
      "xxperm 52, 40, %x10 \n\t" // exchange real and imag part
      "xxperm 53, 41, %x10 \n\t" // exchange real and imag part
      "lxvp 48, 0(%3) \n\t" // y0
      "xxperm 54, 42, %x10 \n\t" // exchange real and imag part
      "xxperm 55, 43, %x10 \n\t" // exchange real and imag part
      "lxvp 50, 32(%3) \n\t" // y2
      "xxperm 56, 44, %x10 \n\t" // exchange real and imag part
      "xxperm 57, 45, %x10 \n\t" // exchange real and imag part
      "lxvp 34, 64(%3) \n\t" // y4
      "xxperm 58, 46, %x10 \n\t" // exchange real and imag part
      "xxperm 59, 47, %x10 \n\t" // exchange real and imag part
      "lxvp 38, 96(%3) \n\t" // y6
      "addi %3, %3, 128 \n\t"
      "addic. %1, %1, -16 \n\t"
      "bgt one%= \n"
      /* Epilogue: compute and store the last block that is already loaded. */
      "two%=: \n\t"
      "xvmaddasp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
      "xvmaddasp 49, 41, 32 \n\t"
      "xvmaddasp 50, 42, 32 \n\t"
      "xvmaddasp 51, 43, 32 \n\t"
      "xvmaddasp 34, 44, 32 \n\t"
      "xvmaddasp 35, 45, 32 \n\t"
      "xvmaddasp 38, 46, 32 \n\t"
      "xvmaddasp 39, 47, 32 \n\t"
      "xvmaddasp 48, 52, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
      "xvmaddasp 49, 53, 33 \n\t"
      "xvmaddasp 50, 54, 33 \n\t"
      "xvmaddasp 51, 55, 33 \n\t"
      "xvmaddasp 34, 56, 33 \n\t"
      "xvmaddasp 35, 57, 33 \n\t"
      "xvmaddasp 38, 58, 33 \n\t"
      "xvmaddasp 39, 59, 33 \n\t"
      "stxvp 48, 0(%4) \n\t"
      "stxvp 50, 32(%4) \n\t"
      "stxvp 34, 64(%4) \n\t"
      "stxvp 38, 96(%4) \n\t"
      "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n"
      :
      "+m" (*y),
      "+r" (n), // 1
      "+b" (x), // 2
      "+b" (y), // 3
      "=b" (ytmp) // 4
      :
      "m" (*x),
      "m" (*mvecp),
      "d" (alpha_r), // 7
      "d" (alpha_i), // 8
      "4" (mvecp), // 9
      "wa" (mask)
      :
      "cr0",
      "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
      "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
      "vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
      "vs56","vs57","vs58","vs59"
    );
}

View File

@ -0,0 +1,126 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "caxpy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable C version of caxpy_kernel_8, compiled only when no other
 * definition has set HAVE_KERNEL_8.
 *
 * Updates y in place with y += (da_r + i*da_i) * x, or with the conjugate of
 * x when CONJ is defined. x and y hold interleaved (real, imag) values.
 * Elements are handled two per pass, so the loop touches an even number of
 * complex elements (n rounded up to the next even value).
 */
static void caxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT da_r,FLOAT da_i)
{
	BLASLONG done;
	BLASLONG pos = 0;	/* offset of the current element's real part */
	int pair;

	for (done = 0; done < n; done += 2)
	{
		/* Two complex elements per pass, processed strictly in order. */
		for (pair = 0; pair < 2; pair++)
		{
#if !defined(CONJ)
			y[pos] += ( da_r * x[pos] - da_i * x[pos+1] ) ;
			y[pos+1] += ( da_r * x[pos+1] + da_i * x[pos] ) ;
#else
			y[pos] += ( da_r * x[pos] + da_i * x[pos+1] ) ;
			y[pos+1] -= ( da_r * x[pos+1] - da_i * x[pos] ) ;
#endif
			pos += 2;
		}
	}
}
#endif
/*
 * Complex single-precision AXPY driver: y += alpha * x (alpha * conj(x) when
 * CONJ is defined), with alpha = da_r + i*da_i.
 *
 *   n             number of complex elements; nothing is done when n <= 0
 *   x, inc_x      source vector and its stride in complex elements
 *   y, inc_y      destination vector (updated in place) and its stride
 *   dummy*        unused
 *
 * When both strides are 1, the largest multiple of 16 elements is handed to
 * caxpy_kernel_8 and the remaining tail is processed here; any other stride
 * combination uses the plain strided loop. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos, ypos;

	if ( n <= 0 ) return(0);

	if ( inc_x == 1 && inc_y == 1 )
	{
		/* n rounded down to a multiple of 16 goes to the kernel. */
		const BLASLONG bulk = n & -16;

		if ( bulk != 0 )
			caxpy_kernel_8 (bulk, x, y, da_r, da_i);

		/* Tail: same offset into x and y, two floats per element. */
		for ( k = bulk, xpos = 2 * bulk; k < n; k++, xpos += 2 )
		{
#if !defined(CONJ)
			y[xpos] += ( da_r * x[xpos] - da_i * x[xpos+1] ) ;
			y[xpos+1] += ( da_r * x[xpos+1] + da_i * x[xpos] ) ;
#else
			y[xpos] += ( da_r * x[xpos] + da_i * x[xpos+1] ) ;
			y[xpos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos] ) ;
#endif
		}
		return(0);
	}

	/* Strides are given in complex elements; convert them to float offsets. */
	inc_x *= 2;
	inc_y *= 2;

	xpos = 0;
	ypos = 0;
	for ( k = 0; k < n; k++ )
	{
#if !defined(CONJ)
		y[ypos] += ( da_r * x[xpos] - da_i * x[xpos+1] ) ;
		y[ypos+1] += ( da_r * x[xpos+1] + da_i * x[xpos] ) ;
#else
		y[ypos] += ( da_r * x[xpos] + da_i * x[xpos+1] ) ;
		y[ypos+1] -= ( da_r * x[xpos+1] - da_i * x[xpos] ) ;
#endif
		xpos += inc_x ;
		ypos += inc_y ;
	}
	return(0);
}

View File

@ -0,0 +1,181 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define HAVE_KERNEL_8 1
/*
 * POWER10 inline-assembly kernel for single-precision AXPY on unit-stride
 * data: y[i] += alpha * x[i].
 *
 *   n      number of floats. The prologue loads 256 bytes from both x and y
 *          before the counter is tested and every pass consumes 64 floats,
 *          so n must be a positive multiple of 64.
 *   x      source array
 *   y      destination array, updated in place
 *   alpha  scalar multiplier; t0 holds it replicated in all four lanes
 *
 * The x vectors live in vs32-35, vs40-43 and vs48-55, the y accumulators in
 * vs36-39, vs44-47 and vs56-63.
 */
static void saxpy_kernel_64(long n, float *x, float *y, float alpha)
{
__vector float t0 = {alpha, alpha,alpha, alpha};
__asm__
(
"dcbt 0, %2 \n\t"
"dcbt 0, %3 \n\t"
/* Prologue: load the first 64 floats of x and of y. */
"lxvp 32, 0(%2) \n\t"
"lxvp 34, 32(%2) \n\t"
"lxvp 40, 64(%2) \n\t"
"lxvp 42, 96(%2) \n\t"
"lxvp 48, 128(%2) \n\t"
"lxvp 50, 160(%2) \n\t"
"lxvp 52, 192(%2) \n\t"
"lxvp 54, 224(%2) \n\t"
"lxvp 36, 0(%3) \n\t"
"lxvp 38, 32(%3) \n\t"
"lxvp 44, 64(%3) \n\t"
"lxvp 46, 96(%3) \n\t"
"lxvp 56, 128(%3) \n\t"
"lxvp 58, 160(%3) \n\t"
"lxvp 60, 192(%3) \n\t"
"lxvp 62, 224(%3) \n\t"
/* Only x is advanced here; y (%3) stays on the block being computed and is
   advanced at the bottom of the loop. */
"addi %2, %2, 256 \n\t"
"addic. %1, %1, -64 \n\t"
"ble two%= \n\t"
".align 5 \n"
/* Steady state: multiply-add and store the current block while reloading x
   from the next block and y from offsets 256..480 (the next y block). */
"one%=: \n\t"
"xvmaddasp 36, 32, %x4 \n\t"
"xvmaddasp 37, 33, %x4 \n\t"
"lxvp 32, 0(%2) \n\t"
"stxvp 36, 0(%3) \n\t"
"xvmaddasp 38, 34, %x4 \n\t"
"xvmaddasp 39, 35, %x4 \n\t"
"lxvp 34, 32(%2) \n\t"
"stxvp 38, 32(%3) \n\t"
"lxvp 36, 256(%3) \n\t"
"lxvp 38, 288(%3) \n\t"
"xvmaddasp 44, 40, %x4 \n\t"
"xvmaddasp 45, 41, %x4 \n\t"
"lxvp 40, 64(%2) \n\t"
"stxvp 44, 64(%3) \n\t"
"xvmaddasp 46, 42, %x4 \n\t"
"xvmaddasp 47, 43, %x4 \n\t"
"lxvp 42, 96(%2) \n\t"
"stxvp 46, 96(%3) \n\t"
"lxvp 44, 320(%3) \n\t"
"lxvp 46, 352(%3) \n\t"
"xvmaddasp 56, 48, %x4 \n\t"
"xvmaddasp 57, 49, %x4 \n\t"
"lxvp 48, 128(%2) \n\t"
"stxvp 56, 128(%3) \n\t"
"xvmaddasp 58, 50, %x4 \n\t"
"xvmaddasp 59, 51, %x4 \n\t"
"lxvp 50, 160(%2) \n\t"
"stxvp 58, 160(%3) \n\t"
"lxvp 56, 384(%3) \n\t"
"lxvp 58, 416(%3) \n\t"
"xvmaddasp 60, 52, %x4 \n\t"
"xvmaddasp 61, 53, %x4 \n\t"
"lxvp 52, 192(%2) \n\t"
"stxvp 60, 192(%3) \n\t"
"xvmaddasp 62, 54, %x4 \n\t"
"xvmaddasp 63, 55, %x4 \n\t"
"lxvp 54, 224(%2) \n\t"
"stxvp 62, 224(%3) \n\t"
"lxvp 60, 448(%3) \n\t"
"lxvp 62, 480(%3) \n\t"
"addi %2, %2, 256 \n\t"
"addi %3, %3, 256 \n\t"
"addic. %1, %1, -64 \n\t"
"bgt one%= \n"
/* Epilogue: compute and store the last block, which is already loaded. */
"two%=: \n\t"
"xvmaddasp 36, 32, %x4 \n\t"
"xvmaddasp 37, 33, %x4 \n\t"
"xvmaddasp 38, 34, %x4 \n\t"
"xvmaddasp 39, 35, %x4 \n\t"
"xvmaddasp 44, 40, %x4 \n\t"
"xvmaddasp 45, 41, %x4 \n\t"
"xvmaddasp 46, 42, %x4 \n\t"
"xvmaddasp 47, 43, %x4 \n\t"
"xvmaddasp 56, 48, %x4 \n\t"
"xvmaddasp 57, 49, %x4 \n\t"
"xvmaddasp 58, 50, %x4 \n\t"
"xvmaddasp 59, 51, %x4 \n\t"
"xvmaddasp 60, 52, %x4 \n\t"
"xvmaddasp 61, 53, %x4 \n\t"
"xvmaddasp 62, 54, %x4 \n\t"
"xvmaddasp 63, 55, %x4 \n\t"
"stxvp 36, 0(%3) \n\t"
"stxvp 38, 32(%3) \n\t"
"stxvp 44, 64(%3) \n\t"
"stxvp 46, 96(%3) \n\t"
"stxvp 56, 128(%3) \n\t"
"stxvp 58, 160(%3) \n\t"
"stxvp 60, 192(%3) \n\t"
"stxvp 62, 224(%3) \n\t"
"#n=%1 x=%5=%2 y=%0=%3 t0=%x4\n"
:
"+m" (*y),
"+r" (n), // 1
"+b" (x), // 2
"+b" (y) // 3
:
"wa" (t0), // 4
"m" (*x)
:
"cr0",
"vs32","vs33","vs34","vs35","vs36","vs37", "vs38", "vs39",
"vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
"vs48","vs49","vs50","vs51","vs52","vs53","vs54","vs55",
"vs56","vs57","vs58","vs59","vs60","vs61","vs62","vs63"
);
}

View File

@ -0,0 +1,119 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined(__VEC__) || defined(__ALTIVEC__)
#include "saxpy_microk_power10.c"
#endif
#ifndef HAVE_KERNEL_8
/*
 * Portable C version of saxpy_kernel_64, compiled only when no other
 * definition has set HAVE_KERNEL_8 (note the guard name differs from the
 * function name).
 *
 * Computes y[i] += alpha * x[i] in ascending order, eight elements per outer
 * pass, so the number of elements touched is n rounded up to a multiple of 8.
 */
static void saxpy_kernel_64(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
	BLASLONG base;
	int lane;

	for (base = 0; base < n; base += 8)
	{
		for (lane = 0; lane < 8; lane++)
			y[base + lane] += alpha * x[base + lane];
	}
}
#endif
/*
 * Single-precision AXPY driver: y += da * x.
 *
 *   n           number of elements; nothing is done when n <= 0
 *   x, inc_x    source vector and its stride
 *   y, inc_y    destination vector (updated in place) and its stride
 *   dummy*      unused
 *
 * When both strides are 1, the largest multiple of 64 elements is handed to
 * saxpy_kernel_64 and the tail is finished element by element. Otherwise the
 * strided path handles four elements per pass (all four products are formed
 * before any store) followed by a scalar tail. Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
	BLASLONG k;
	BLASLONG xpos = 0, ypos = 0;
	BLASLONG quads;

	if ( n <= 0 ) return(0);

	if ( inc_x == 1 && inc_y == 1 )
	{
		/* n rounded down to a multiple of 64 goes to the kernel. */
		const BLASLONG bulk = n & -64;

		if ( bulk != 0 )
			saxpy_kernel_64(bulk, x, y, da);

		for ( k = bulk; k < n; k++ )
			y[k] += da * x[k] ;

		return(0);
	}

	/* Strided path: groups of four first, then whatever is left. */
	quads = n & -4;
	for ( k = 0; k < quads; k += 4 )
	{
		FLOAT p0 = da * x[xpos] ;
		FLOAT p1 = da * x[xpos+inc_x] ;
		FLOAT p2 = da * x[xpos+2*inc_x] ;
		FLOAT p3 = da * x[xpos+3*inc_x] ;

		y[ypos] += p0 ;
		y[ypos+inc_y] += p1 ;
		y[ypos+2*inc_y] += p2 ;
		y[ypos+3*inc_y] += p3 ;

		xpos += inc_x*4 ;
		ypos += inc_y*4 ;
	}

	for ( ; k < n; k++ )
	{
		y[ypos] += da * x[xpos] ;
		xpos += inc_x ;
		ypos += inc_y ;
	}
	return(0);
}

View File

@ -69,7 +69,7 @@ gotoblas_t TABLE_NAME = {
snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS, snrm2_kTS, sasum_kTS, ssum_kTS, scopy_kTS, sbdot_kTS,
dsdot_kTS, dsdot_kTS,
srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS, srot_kTS, saxpy_kTS, sscal_kTS, sswap_kTS,
sgemv_nTS, sgemv_tTS, sger_kTS, sbgemv_nTS, sbgemv_tTS, sger_kTS,
ssymv_LTS, ssymv_UTS, ssymv_LTS, ssymv_UTS,
sbgemm_kernelTS, sbgemm_betaTS, sbgemm_kernelTS, sbgemm_betaTS,

View File

@ -384,6 +384,14 @@ endif
GEMVDEP = ../l2param.h GEMVDEP = ../l2param.h
# Default sources for the bfloat16 GEMV kernels (non-transposed and
# transposed); each is assigned only if the variable is not defined yet.
ifndef SBGEMVNKERNEL
SBGEMVNKERNEL = sbgemv_n.c
endif
ifndef SBGEMVTKERNEL
SBGEMVTKERNEL = sbgemv_t.c
endif
ifndef SGEMVNKERNEL ifndef SGEMVNKERNEL
SGEMVNKERNEL = sgemv_n.c SGEMVNKERNEL = sgemv_n.c
endif endif

View File

@ -0,0 +1,795 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#ifndef __BF16_COMMON_MACROS
#define __BF16_COMMON_MACROS
#include <immintrin.h>
/* Assign the low 256 bits of reg512_0 and reg512_1 to reg256_0 and reg256_1.
   Both names are pasted with the _0/_1 suffixes, so four variables with those
   exact names must exist at the expansion site. The expansion is two
   complete statements. */
#define EXTRACT_LOW_256_FROM_512_2X(reg256, reg512) \
reg256##_0 = _mm512_castps512_ps256(reg512##_0); \
reg256##_1 = _mm512_castps512_ps256(reg512##_1);
/* Unaligned 512-bit loads of eight consecutive rows into regArray_0 ..
   regArray_7. Row k is read from &a[(idx_m+k)*lda + idx_n]; lda and idx_n
   are not parenthesized, so pass simple operands. The expansion is eight
   complete statements. */
#define BF16_MATRIX_LOAD_8x32(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm512_loadu_si512(&a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm512_loadu_si512(&a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm512_loadu_si512(&a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm512_loadu_si512(&a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm512_loadu_si512(&a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm512_loadu_si512(&a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm512_loadu_si512(&a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm512_loadu_si512(&a[(idx_m+7)*lda + idx_n]);
/* Unaligned 256-bit loads of eight consecutive rows into regArray_0 ..
   regArray_7. Row k is read from &a[(idx_m+k)*lda + idx_n].
   _mm256_loadu_si256 is declared with a `__m256i const *` parameter, so the
   element address is cast explicitly; passing the element pointer directly is
   an incompatible-pointer-type diagnostic. The expansion is eight complete
   statements. */
#define BF16_MATRIX_LOAD_8x16(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm256_loadu_si256((const __m256i *)(&a[(idx_m+7)*lda + idx_n]));
/* Same as above with 128-bit loads; _mm_loadu_si128 is declared with a
   `__m128i const *` parameter, hence the explicit cast. */
#define BF16_MATRIX_LOAD_8x8(regArray, a, lda, idx_m, idx_n) \
regArray##_0 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+0)*lda + idx_n])); \
regArray##_1 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+1)*lda + idx_n])); \
regArray##_2 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+2)*lda + idx_n])); \
regArray##_3 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+3)*lda + idx_n])); \
regArray##_4 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+4)*lda + idx_n])); \
regArray##_5 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+5)*lda + idx_n])); \
regArray##_6 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+6)*lda + idx_n])); \
regArray##_7 = _mm_loadu_si128((const __m128i *)(&a[(idx_m+7)*lda + idx_n]));
/* Unaligned 512-bit load of a single row into regArray, read from
   &a[idx_m*lda + idx_n]. The index operands are parenthesized so that an
   expression argument (for example idx_m = i+1) is multiplied as a whole
   instead of binding as i + 1*lda. The expansion is one complete statement. */
#define BF16_MATRIX_LOAD_1x32(regArray, a, lda, idx_m, idx_n) \
regArray = _mm512_loadu_si512(&a[(idx_m)*(lda) + (idx_n)]);
/* Zero-masked loads of eight consecutive rows into regArray_0 .. regArray_7.
   Row k is read from &a[(idx_m+k)*lda + idx_n]; 16-bit lanes whose bit in
   `mask` is clear are set to zero. The three variants differ only in vector
   width (512, 256 and 128 bits). lda and idx_n are not parenthesized, so pass
   simple operands. Each expansion is eight complete statements. */
#define BF16_MATRIX_MASKZ_LOAD_8x32(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
#define BF16_MATRIX_MASKZ_LOAD_8x16(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
#define BF16_MATRIX_MASKZ_LOAD_8x8(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]); \
regArray##_4 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
regArray##_5 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+5)*lda + idx_n]); \
regArray##_6 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
regArray##_7 = _mm_maskz_loadu_epi16(mask, &a[(idx_m+7)*lda + idx_n]);
/* Four-row versions of the zero-masked row loads: rows idx_m .. idx_m+3 go
   into regArray_0 .. regArray_3, using 512-bit (4x32) or 256-bit (4x16)
   vectors. Lanes whose mask bit is clear are zeroed. */
#define BF16_MATRIX_MASKZ_LOAD_4x32(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
#define BF16_MATRIX_MASKZ_LOAD_4x16(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+1)*lda + idx_n]); \
regArray##_2 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_3 = _mm256_maskz_loadu_epi16(mask, &a[(idx_m+3)*lda + idx_n]);
/* "_2" variants: zero-masked 512-bit loads that take every second row.
   regArray_k is read from row idx_m + 2*k, so the 8-register form spans rows
   idx_m .. idx_m+14 and the 4-register form rows idx_m .. idx_m+6. */
#define BF16_MATRIX_MASKZ_LOAD_8x32_2(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]); \
regArray##_4 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+8)*lda + idx_n]); \
regArray##_5 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+10)*lda + idx_n]); \
regArray##_6 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+12)*lda + idx_n]); \
regArray##_7 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+14)*lda + idx_n]);
#define BF16_MATRIX_MASKZ_LOAD_4x32_2(regArray, a, lda, idx_m, idx_n, mask) \
regArray##_0 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+0)*lda + idx_n]); \
regArray##_1 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+2)*lda + idx_n]); \
regArray##_2 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+4)*lda + idx_n]); \
regArray##_3 = _mm512_maskz_loadu_epi16(mask, &a[(idx_m+6)*lda + idx_n]);
/* Zero-masked 512-bit load of a single row into regArray, read from
   &a[idx_m*lda + idx_n]; 16-bit lanes whose bit in `mask` is clear are
   zeroed. The index operands are parenthesized so that an expression
   argument (for example idx_m = i+1) is multiplied as a whole instead of
   binding as i + 1*lda. The expansion is one complete statement. */
#define BF16_MATRIX_MASKZ_LOAD_1x32(regArray, a, lda, idx_m, idx_n, mask) \
regArray = _mm512_maskz_loadu_epi16(mask, &a[(idx_m)*(lda) + (idx_n)]);
/* Unaligned full-width loads from x + idx_n into reg, in 512-, 256- and
   128-bit widths. _mm256_loadu_si256 and _mm_loadu_si128 are declared with
   `__m256i const *` / `__m128i const *` parameters, so the element address is
   cast explicitly; _mm512_loadu_si512 takes `void const *` and needs no cast.
   Each expansion is one complete statement. */
#define BF16_VECTOR_LOAD_1x32(reg, x, idx_n) \
reg = _mm512_loadu_si512(x + idx_n);
#define BF16_VECTOR_LOAD_1x16(reg, x, idx_n) \
reg = _mm256_loadu_si256((const __m256i *)(x + idx_n));
#define BF16_VECTOR_LOAD_1x8(reg, x, idx_n) \
reg = _mm_loadu_si128((const __m128i *)(x + idx_n));
/* Zero-masked loads from x + idx_n into reg, in 512-, 256- and 128-bit
   widths; 16-bit lanes whose bit in `mask` is clear are set to zero. Each
   expansion is one complete statement. */
#define BF16_VECTOR_MASKZ_LOAD_1x32(reg, x, idx_n, mask) \
reg = _mm512_maskz_loadu_epi16(mask, x + idx_n);
#define BF16_VECTOR_MASKZ_LOAD_1x16(reg, x, idx_n, mask) \
reg = _mm256_maskz_loadu_epi16(mask, x + idx_n);
#define BF16_VECTOR_MASKZ_LOAD_1x8(reg, x, idx_n, mask) \
reg = _mm_maskz_loadu_epi16(mask, x + idx_n);
/* 2-step interleave for matrix against 8 rows with 32 BF16 elements per row
Input - register array of 8 rows of row-major matrix
Output - the output of Step 2
Step 1: 2-element interleave for matrix
|a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
|c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
|e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11|e16|e17|f16|f17|e18|e19|f18|f19|e24|e25|f24|f25|e26|e27|f26|f27
|g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11|g16|g17|h16|h17|g18|g19|h18|h19|g24|g25|h24|h25|g26|g27|h26|h27
|a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
|c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
|e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15|e20|e21|f20|f21|e22|e23|f22|f23|e28|e29|f28|f29|e30|e31|f30|f31
|g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15|g20|g21|h20|h21|g22|g23|h22|h23|g28|g29|h28|h29|g30|g31|h30|h31
Step 2: 4-element interleave for matrix
|a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
|a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
|e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9 |e16|e17|f16|f17|g16|g17|h16|h17|e24|e25|f24|f25|g24|g25|h24|h25
|e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11|e18|e19|f18|f19|g18|g19|h18|h19|e26|e27|f26|f27|g26|g27|h26|h27
|a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
|a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
|e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13|e20|e21|f20|f21|g20|g21|h20|h21|e28|e29|f28|f29|g28|g29|h28|h29
|e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15|e22|e23|f22|f23|g22|g23|h22|h23|e30|e31|f30|f31|g30|g31|h30|h31
*/
/* In-place interleave of regArray_0 .. regArray_7 (512-bit vectors).
   regArray_8 .. regArray_15 must also be declared: they are overwritten with
   the 32-bit unpack results (step 1) and then combined by the 64-bit unpacks
   (step 2) back into regArray_0 .. regArray_7. */
#define BF16_INTERLEAVE_8x32(regArray) \
regArray##_8 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \
regArray##_9 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \
regArray##_10 = _mm512_unpacklo_epi32(regArray##_4, regArray##_5); \
regArray##_11 = _mm512_unpacklo_epi32(regArray##_6, regArray##_7); \
regArray##_12 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \
regArray##_13 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \
regArray##_14 = _mm512_unpackhi_epi32(regArray##_4, regArray##_5); \
regArray##_15 = _mm512_unpackhi_epi32(regArray##_6, regArray##_7); \
\
regArray##_0 = _mm512_unpacklo_epi64(regArray##_8, regArray##_9); \
regArray##_1 = _mm512_unpackhi_epi64(regArray##_8, regArray##_9); \
regArray##_2 = _mm512_unpacklo_epi64(regArray##_10, regArray##_11); \
regArray##_3 = _mm512_unpackhi_epi64(regArray##_10, regArray##_11); \
regArray##_4 = _mm512_unpacklo_epi64(regArray##_12, regArray##_13); \
regArray##_5 = _mm512_unpackhi_epi64(regArray##_12, regArray##_13); \
regArray##_6 = _mm512_unpacklo_epi64(regArray##_14, regArray##_15); \
regArray##_7 = _mm512_unpackhi_epi64(regArray##_14, regArray##_15);
/* 2-step interleave for matrix against 8 rows with 16 BF16 elements per row
Input - register array of 8 rows of row-major matrix
Output - the output of Step 2
Step 1: 2-element interleave for matrix
|a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
|c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
|e0|e1|f0|f1|e2|e3|f2|f3|e8 |e9 |f8 |f9 |e10|e11|f10|f11
|g0|g1|h0|h1|g2|g3|h2|h3|g8 |g9 |h8 |h9 |g10|g11|h10|h11
|a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
|c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
|e4|e5|f4|f5|e6|e7|f6|f7|e12|e13|f12|f13|e14|e15|f14|f15
|g4|g5|h4|h5|g6|g7|h6|h7|g12|g13|h12|h13|g14|g15|h14|h15
Step 2: 4-element interleave for matrix
|a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
|a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
|e0|e1|f0|f1|g0|g1|h0|h1|e8 |e9 |f8 |f9 |g8 |g9 |h8 |h9
|e2|e3|f2|f3|g2|g3|h2|h3|e10|e11|f10|f11|g10|g11|h10|h11
|a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
|a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
|e4|e5|f4|f5|g4|g5|h4|h5|e12|e13|f12|f13|g12|g13|h12|h13
|e6|e7|f6|f7|g6|g7|h6|h7|e14|e15|f14|f15|g14|g15|h14|h15
*/
#define BF16_INTERLEAVE_8x16(regArray) \
regArray##_8 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \
regArray##_9 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \
regArray##_10 = _mm256_unpacklo_epi32(regArray##_4, regArray##_5); \
regArray##_11 = _mm256_unpacklo_epi32(regArray##_6, regArray##_7); \
regArray##_12 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \
regArray##_13 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \
regArray##_14 = _mm256_unpackhi_epi32(regArray##_4, regArray##_5); \
regArray##_15 = _mm256_unpackhi_epi32(regArray##_6, regArray##_7); \
\
regArray##_0 = _mm256_unpacklo_epi64(regArray##_8, regArray##_9); \
regArray##_1 = _mm256_unpackhi_epi64(regArray##_8, regArray##_9); \
regArray##_2 = _mm256_unpacklo_epi64(regArray##_10, regArray##_11); \
regArray##_3 = _mm256_unpackhi_epi64(regArray##_10, regArray##_11); \
regArray##_4 = _mm256_unpacklo_epi64(regArray##_12, regArray##_13); \
regArray##_5 = _mm256_unpackhi_epi64(regArray##_12, regArray##_13); \
regArray##_6 = _mm256_unpacklo_epi64(regArray##_14, regArray##_15); \
regArray##_7 = _mm256_unpackhi_epi64(regArray##_14, regArray##_15);
/* 2-step interleave for matrix against 4 rows with 32 BF16 elements per row
Input - register array of 4 rows of row-major matrix (rows a..d in regArray##_0 .. regArray##_3, one 512-bit register per row)
Output - the output of Step 2, written to regArray##_0 .. regArray##_3 in the order listed below
Scratch - regArray##_4 .. regArray##_7 receive the output of Step 1 (in the order listed below) and are clobbered
Step 1: 2-element interleave for matrix
|a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11|a16|a17|b16|b17|a18|a19|b18|b19|a24|a25|b24|b25|a26|a27|b26|b27
|c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11|c16|c17|d16|d17|c18|c19|d18|d19|c24|c25|d24|d25|c26|c27|d26|d27
|a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15|a20|a21|b20|b21|a22|a23|b22|b23|a28|a29|b28|b29|a30|a31|b30|b31
|c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15|c20|c21|d20|d21|c22|c23|d22|d23|c28|c29|d28|d29|c30|c31|d30|d31
Step 2: 4-element interleave for matrix
|a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9 |a16|a17|b16|b17|c16|c17|d16|d17|a24|a25|b24|b25|c24|c25|d24|d25
|a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11|a18|a19|b18|b19|c18|c19|d18|d19|a26|a27|b26|b27|c26|c27|d26|d27
|a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13|a20|a21|b20|b21|c20|c21|d20|d21|a28|a29|b28|b29|c28|c29|d28|d29
|a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15|a22|a23|b22|b23|c22|c23|d22|d23|a30|a31|b30|b31|c30|c31|d30|d31
*/
#define BF16_INTERLEAVE_4x32(regArray) \
regArray##_4 = _mm512_unpacklo_epi32(regArray##_0, regArray##_1); \
regArray##_5 = _mm512_unpacklo_epi32(regArray##_2, regArray##_3); \
regArray##_6 = _mm512_unpackhi_epi32(regArray##_0, regArray##_1); \
regArray##_7 = _mm512_unpackhi_epi32(regArray##_2, regArray##_3); \
\
regArray##_0 = _mm512_unpacklo_epi64(regArray##_4, regArray##_5); \
regArray##_1 = _mm512_unpackhi_epi64(regArray##_4, regArray##_5); \
regArray##_2 = _mm512_unpacklo_epi64(regArray##_6, regArray##_7); \
regArray##_3 = _mm512_unpackhi_epi64(regArray##_6, regArray##_7);
/* 2-step interleave for matrix against 4 rows with 16 BF16 elements per row
Input - register array of 4 rows of row-major matrix (rows a..d in regArray##_0 .. regArray##_3, one 256-bit register per row)
Output - the output of Step 2, written to regArray##_0 .. regArray##_3 in the order listed below
Scratch - regArray##_4 .. regArray##_7 receive the output of Step 1 (in the order listed below) and are clobbered
Step 1: 2-element interleave for matrix
|a0|a1|b0|b1|a2|a3|b2|b3|a8 |a9 |b8 |b9 |a10|a11|b10|b11
|c0|c1|d0|d1|c2|c3|d2|d3|c8 |c9 |d8 |d9 |c10|c11|d10|d11
|a4|a5|b4|b5|a6|a7|b6|b7|a12|a13|b12|b13|a14|a15|b14|b15
|c4|c5|d4|d5|c6|c7|d6|d7|c12|c13|d12|d13|c14|c15|d14|d15
Step 2: 4-element interleave for matrix
|a0|a1|b0|b1|c0|c1|d0|d1|a8 |a9 |b8 |b9 |c8 |c9 |d8 |d9
|a2|a3|b2|b3|c2|c3|d2|d3|a10|a11|b10|b11|c10|c11|d10|d11
|a4|a5|b4|b5|c4|c5|d4|d5|a12|a13|b12|b13|c12|c13|d12|d13
|a6|a7|b6|b7|c6|c7|d6|d7|a14|a15|b14|b15|c14|c15|d14|d15
*/
#define BF16_INTERLEAVE_4x16(regArray) \
regArray##_4 = _mm256_unpacklo_epi32(regArray##_0, regArray##_1); \
regArray##_5 = _mm256_unpacklo_epi32(regArray##_2, regArray##_3); \
regArray##_6 = _mm256_unpackhi_epi32(regArray##_0, regArray##_1); \
regArray##_7 = _mm256_unpackhi_epi32(regArray##_2, regArray##_3); \
\
regArray##_0 = _mm256_unpacklo_epi64(regArray##_4, regArray##_5); \
regArray##_1 = _mm256_unpackhi_epi64(regArray##_4, regArray##_5); \
regArray##_2 = _mm256_unpacklo_epi64(regArray##_6, regArray##_7); \
regArray##_3 = _mm256_unpackhi_epi64(regArray##_6, regArray##_7);
/* 2-step interleave for x with 32 BF16 elements
Input - original vector in regArray##_0 (512-bit)
Output - the output of Step 2, written to regArray##_0 .. regArray##_3 in the order listed below
Scratch - regArray##_1 and regArray##_3 hold the two rows of Step 1 before they are overwritten with Step 2 results;
the statement order matters because regArray##_0 / regArray##_2 must be derived before regArray##_1 / regArray##_3 are replaced
Step 1: 2-element interleave for x:
|x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11|x16|x17|x16|x17|x18|x19|x18|x19|x24|x25|x24|x25|x26|x27|x26|x27
|x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15|x20|x21|x20|x21|x22|x23|x22|x23|x28|x29|x28|x29|x30|x31|x30|x31
Step 2: 4-element interleave for x:
|x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9 |x16|x17|x16|x17|x16|x17|x16|x17|x24|x25|x24|x25|x24|x25|x24|x25
|x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11|x18|x19|x18|x19|x18|x19|x18|x19|x26|x27|x26|x27|x26|x27|x26|x27
|x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13|x20|x21|x20|x21|x20|x21|x20|x21|x28|x29|x28|x29|x28|x29|x28|x29
|x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15|x22|x23|x22|x23|x22|x23|x22|x23|x30|x31|x30|x31|x30|x31|x30|x31
*/
#define BF16_INTERLEAVE_1x32(regArray) \
regArray##_1 = _mm512_unpacklo_epi32(regArray##_0, regArray##_0); \
regArray##_3 = _mm512_unpackhi_epi32(regArray##_0, regArray##_0); \
\
regArray##_0 = _mm512_unpacklo_epi64(regArray##_1, regArray##_1); \
regArray##_1 = _mm512_unpackhi_epi64(regArray##_1, regArray##_1); \
regArray##_2 = _mm512_unpacklo_epi64(regArray##_3, regArray##_3); \
regArray##_3 = _mm512_unpackhi_epi64(regArray##_3, regArray##_3);
/* 2-step interleave for x with 16 BF16 elements
Input - original vector in regArray##_0 (256-bit)
Output - the output of Step 2, written to regArray##_0 .. regArray##_3 in the order listed below
Scratch - regArray##_1 and regArray##_3 hold the two rows of Step 1 before they are overwritten with Step 2 results;
the statement order matters because regArray##_0 / regArray##_2 must be derived before regArray##_1 / regArray##_3 are replaced
Step 1: 2-element interleave for x:
|x0|x1|x0|x1|x2|x3|x2|x3|x8 |x9 |x8 |x9 |x10|x11|x10|x11
|x4|x5|x4|x5|x6|x7|x6|x7|x12|x13|x12|x13|x14|x15|x14|x15
Step 2: 4-element interleave for x:
|x0|x1|x0|x1|x0|x1|x0|x1|x8 |x9 |x8 |x9 |x8 |x9 |x8 |x9
|x2|x3|x2|x3|x2|x3|x2|x3|x10|x11|x10|x11|x10|x11|x10|x11
|x4|x5|x4|x5|x4|x5|x4|x5|x12|x13|x12|x13|x12|x13|x12|x13
|x6|x7|x6|x7|x6|x7|x6|x7|x14|x15|x14|x15|x14|x15|x14|x15
*/
#define BF16_INTERLEAVE_1x16(regArray) \
regArray##_1 = _mm256_unpacklo_epi32(regArray##_0, regArray##_0); \
regArray##_3 = _mm256_unpackhi_epi32(regArray##_0, regArray##_0); \
\
regArray##_0 = _mm256_unpacklo_epi64(regArray##_1, regArray##_1); \
regArray##_1 = _mm256_unpackhi_epi64(regArray##_1, regArray##_1); \
regArray##_2 = _mm256_unpacklo_epi64(regArray##_3, regArray##_3); \
regArray##_3 = _mm256_unpackhi_epi64(regArray##_3, regArray##_3);
/* 1-step interleave to exchange the high 256 bits and low 256 bits of 4 pairs of registers
Input - regArray##_8 .. regArray##_15, paired as (_8,_12), (_9,_13), (_10,_14), (_11,_15); the inputs are not modified
Output - regArray##_0 .. regArray##_7; for each pair (p, q):
selector 0x44 yields | low 256 bits of p | low 256 bits of q |
selector 0xee yields | high 256 bits of p | high 256 bits of q |
NOTE(review): the diagram below presumably shows the resulting regArray##_0 .. regArray##_7 - confirm against the call sites
|a0|a1|...|a14|a15|i0|i1|...|i14|i15|
|b0|b1|...|b14|b15|j0|j1|...|j14|j15|
|c0|c1|...|c14|c15|k0|k1|...|k14|k15|
|d0|d1|...|d14|d15|l0|l1|...|l14|l15|
|e0|e1|...|e14|e15|m0|m1|...|m14|m15|
|f0|f1|...|f14|f15|n0|n1|...|n14|n15|
|g0|g1|...|g14|g15|o0|o1|...|o14|o15|
|h0|h1|...|h14|h15|p0|p1|...|p14|p15|
*/
#define BF16_INTERLEAVE256_8x32(regArray) \
regArray##_0 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0x44); \
regArray##_1 = _mm512_shuffle_i32x4(regArray##_8, regArray##_12, 0xee); \
regArray##_2 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0x44); \
regArray##_3 = _mm512_shuffle_i32x4(regArray##_9, regArray##_13, 0xee); \
regArray##_4 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0x44); \
regArray##_5 = _mm512_shuffle_i32x4(regArray##_10, regArray##_14, 0xee); \
regArray##_6 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0x44); \
regArray##_7 = _mm512_shuffle_i32x4(regArray##_11, regArray##_15, 0xee);
/* 1-step interleave to exchange the high 256 bits and low 256 bits of 2 pairs of registers
Input - regArray##_4 .. regArray##_7, paired as (_4,_6) and (_5,_7); the inputs are not modified
Output - regArray##_0 .. regArray##_3; for each pair (p, q):
selector 0x44 yields | low 256 bits of p | low 256 bits of q |
selector 0xee yields | high 256 bits of p | high 256 bits of q |
NOTE(review): the diagram below presumably shows the resulting regArray##_0 .. regArray##_3 - confirm against the call sites
|a0|a1|...|a14|a15|e0|e1|...|e14|e15|
|b0|b1|...|b14|b15|f0|f1|...|f14|f15|
|c0|c1|...|c14|c15|g0|g1|...|g14|g15|
|d0|d1|...|d14|d15|h0|h1|...|h14|h15|
*/
#define BF16_INTERLEAVE256_4x32(regArray) \
regArray##_0 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0x44); \
regArray##_1 = _mm512_shuffle_i32x4(regArray##_4, regArray##_6, 0xee); \
regArray##_2 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0x44); \
regArray##_3 = _mm512_shuffle_i32x4(regArray##_5, regArray##_7, 0xee);
/* Permute the 16-bit elements of 8 registers with one shared index vector
idx - index vector handed to _mm512_permutexvar_epi16 for every register
Input - regArray##_0 .. regArray##_7 (left unmodified)
Output - regArray##_8 .. regArray##_15, where regArray##_(8+k) is the permutation of regArray##_k
*/
#define BF16_PERMUTE_8x32(idx, regArray) \
regArray##_8 = _mm512_permutexvar_epi16(idx, regArray##_0); \
regArray##_9 = _mm512_permutexvar_epi16(idx, regArray##_1); \
regArray##_10 = _mm512_permutexvar_epi16(idx, regArray##_2); \
regArray##_11 = _mm512_permutexvar_epi16(idx, regArray##_3); \
regArray##_12 = _mm512_permutexvar_epi16(idx, regArray##_4); \
regArray##_13 = _mm512_permutexvar_epi16(idx, regArray##_5); \
regArray##_14 = _mm512_permutexvar_epi16(idx, regArray##_6); \
regArray##_15 = _mm512_permutexvar_epi16(idx, regArray##_7);
/* Permute the 32-bit elements (i.e. pairs of BF16 values) of 8 registers with one shared index vector
idx - index vector handed to _mm512_permutexvar_epi32 for every register
Input - regArray##_0 .. regArray##_7 (left unmodified)
Output - regArray##_8 .. regArray##_15, where regArray##_(8+k) is the permutation of regArray##_k
*/
#define BF16_PERMUTE_8x32_2(idx, regArray) \
regArray##_8 = _mm512_permutexvar_epi32(idx, regArray##_0); \
regArray##_9 = _mm512_permutexvar_epi32(idx, regArray##_1); \
regArray##_10 = _mm512_permutexvar_epi32(idx, regArray##_2); \
regArray##_11 = _mm512_permutexvar_epi32(idx, regArray##_3); \
regArray##_12 = _mm512_permutexvar_epi32(idx, regArray##_4); \
regArray##_13 = _mm512_permutexvar_epi32(idx, regArray##_5); \
regArray##_14 = _mm512_permutexvar_epi32(idx, regArray##_6); \
regArray##_15 = _mm512_permutexvar_epi32(idx, regArray##_7);
/* Permute the 16-bit elements of 4 registers with one shared index vector
idx - index vector handed to _mm512_permutexvar_epi16 for every register
Input - regArray##_0 .. regArray##_3 (left unmodified)
Output - regArray##_4 .. regArray##_7, where regArray##_(4+k) is the permutation of regArray##_k
*/
#define BF16_PERMUTE_4x32(idx, regArray) \
regArray##_4 = _mm512_permutexvar_epi16(idx, regArray##_0); \
regArray##_5 = _mm512_permutexvar_epi16(idx, regArray##_1); \
regArray##_6 = _mm512_permutexvar_epi16(idx, regArray##_2); \
regArray##_7 = _mm512_permutexvar_epi16(idx, regArray##_3);
/* Permute the 32-bit elements (i.e. pairs of BF16 values) of 4 registers with one shared index vector
idx - index vector handed to _mm512_permutexvar_epi32 for every register
Input - regArray##_0 .. regArray##_3 (left unmodified)
Output - regArray##_4 .. regArray##_7, where regArray##_(4+k) is the permutation of regArray##_k
*/
#define BF16_PERMUTE_4x32_2(idx, regArray) \
regArray##_4 = _mm512_permutexvar_epi32(idx, regArray##_0); \
regArray##_5 = _mm512_permutexvar_epi32(idx, regArray##_1); \
regArray##_6 = _mm512_permutexvar_epi32(idx, regArray##_2); \
regArray##_7 = _mm512_permutexvar_epi32(idx, regArray##_3);
/* Calculate the dot result for 2-step interleaved matrix and vector
(Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
accumArray##_0 accumulates matArray##_0, _1, _4, _5 and accumArray##_1 accumulates matArray##_2, _3, _6, _7,
multiplied with xArray##_0, _1, _2, _3 respectively.
The two accumulators are updated alternately, so consecutive _mm512_dpbf16_ps calls never depend on each other.
*/
#define BF16_2STEP_INTERLEAVED_DOT_8x32(accumArray, matArray, xArray) \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_2, (__m512bh) xArray##_0); \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_1); \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_4, (__m512bh) xArray##_2); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_6, (__m512bh) xArray##_2); \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_5, (__m512bh) xArray##_3); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_7, (__m512bh) xArray##_3);
/* Calculate the dot result for 2-step interleaved matrix and vector
(Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
256-bit counterpart of the 8x32 variant:
accumArray##_0 accumulates matArray##_0, _1, _4, _5 and accumArray##_1 accumulates matArray##_2, _3, _6, _7,
multiplied with xArray##_0, _1, _2, _3 respectively.
The two accumulators are updated alternately, so consecutive _mm256_dpbf16_ps calls never depend on each other.
*/
#define BF16_2STEP_INTERLEAVED_DOT_8x16(accumArray, matArray, xArray) \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_2, (__m256bh) xArray##_0); \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_1); \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_4, (__m256bh) xArray##_2); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_6, (__m256bh) xArray##_2); \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_5, (__m256bh) xArray##_3); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_7, (__m256bh) xArray##_3);
/* Calculate the dot result for 2-step interleaved matrix and vector
(Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
accumArray##_0 accumulates matArray##_0 * xArray##_0 and matArray##_2 * xArray##_2,
accumArray##_1 accumulates matArray##_1 * xArray##_1 and matArray##_3 * xArray##_3.
The caller still has to add the two accumulators to obtain the complete sum.
*/
#define BF16_2STEP_INTERLEAVED_DOT_4x32(accumArray, matArray, xArray) \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray##_0); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray##_1); \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_2, (__m512bh) xArray##_2); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_3, (__m512bh) xArray##_3);
/* Calculate the dot result for 2-step interleaved matrix and vector
(Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
256-bit counterpart of the 4x32 variant:
accumArray##_0 accumulates matArray##_0 * xArray##_0 and matArray##_2 * xArray##_2,
accumArray##_1 accumulates matArray##_1 * xArray##_1 and matArray##_3 * xArray##_3.
The caller still has to add the two accumulators to obtain the complete sum.
*/
#define BF16_2STEP_INTERLEAVED_DOT_4x16(accumArray, matArray, xArray) \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray##_0); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray##_1); \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_2, (__m256bh) xArray##_2); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_3, (__m256bh) xArray##_3);
/* Calculate the dot result for matrix and vector at 32 elements per row
(Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
Each of the 8 registers matArray##_0 .. matArray##_7 is multiplied with the same xArray register
and accumulated into its own accumArray##_0 .. accumArray##_7.
*/
#define BF16_DOT_8x32(accumArray, matArray, xArray) \
accumArray##_0 = _mm512_dpbf16_ps(accumArray##_0, (__m512bh) matArray##_0, (__m512bh) xArray); \
accumArray##_1 = _mm512_dpbf16_ps(accumArray##_1, (__m512bh) matArray##_1, (__m512bh) xArray); \
accumArray##_2 = _mm512_dpbf16_ps(accumArray##_2, (__m512bh) matArray##_2, (__m512bh) xArray); \
accumArray##_3 = _mm512_dpbf16_ps(accumArray##_3, (__m512bh) matArray##_3, (__m512bh) xArray); \
accumArray##_4 = _mm512_dpbf16_ps(accumArray##_4, (__m512bh) matArray##_4, (__m512bh) xArray); \
accumArray##_5 = _mm512_dpbf16_ps(accumArray##_5, (__m512bh) matArray##_5, (__m512bh) xArray); \
accumArray##_6 = _mm512_dpbf16_ps(accumArray##_6, (__m512bh) matArray##_6, (__m512bh) xArray); \
accumArray##_7 = _mm512_dpbf16_ps(accumArray##_7, (__m512bh) matArray##_7, (__m512bh) xArray);
/* Calculate the dot result for matrix and vector at 32 elements per row
(Assume throughput for _mm512_dpbf16_ps is 0.5, tunable per platform)
Single-register form: the arguments are used as given (no _N suffix is pasted on),
and accumArray is updated in place with accumArray + matArray * xArray.
*/
#define BF16_DOT_1x32(accumArray, matArray, xArray) \
accumArray = _mm512_dpbf16_ps(accumArray, (__m512bh) matArray, (__m512bh) xArray);
/* Calculate the dot result for matrix and vector at 16 elements per row
(Assume throughput for _mm256_dpbf16_ps is 0.5, tunable per platform)
Each of the 8 registers matArray##_0 .. matArray##_7 is multiplied with the same xArray register
and accumulated into its own accumArray##_0 .. accumArray##_7.
*/
#define BF16_DOT_8x16(accumArray, matArray, xArray) \
accumArray##_0 = _mm256_dpbf16_ps(accumArray##_0, (__m256bh) matArray##_0, (__m256bh) xArray); \
accumArray##_1 = _mm256_dpbf16_ps(accumArray##_1, (__m256bh) matArray##_1, (__m256bh) xArray); \
accumArray##_2 = _mm256_dpbf16_ps(accumArray##_2, (__m256bh) matArray##_2, (__m256bh) xArray); \
accumArray##_3 = _mm256_dpbf16_ps(accumArray##_3, (__m256bh) matArray##_3, (__m256bh) xArray); \
accumArray##_4 = _mm256_dpbf16_ps(accumArray##_4, (__m256bh) matArray##_4, (__m256bh) xArray); \
accumArray##_5 = _mm256_dpbf16_ps(accumArray##_5, (__m256bh) matArray##_5, (__m256bh) xArray); \
accumArray##_6 = _mm256_dpbf16_ps(accumArray##_6, (__m256bh) matArray##_6, (__m256bh) xArray); \
accumArray##_7 = _mm256_dpbf16_ps(accumArray##_7, (__m256bh) matArray##_7, (__m256bh) xArray);
/* 2-step interleave for matrix against 8 rows with 16 fp32 elements per row
Input - register array of 8 rows of row-major matrix (rows a..h in regArray##_0 .. regArray##_7, one 512-bit register per row)
Output - the output of Step 2
Scratch - regArray##_8 .. regArray##_15 receive the output of Step 1 (in the order listed below) and are clobbered
Note - the Step 2 rows are listed in the order they are computed; they are stored to
regArray##_0, _1, _4, _5, _2, _3, _6, _7 respectively, so regArray##_0 .. _3 end up with rows a-d
and regArray##_4 .. _7 with rows e-h
Step 1: 2-element interleave for matrix
|a0|b0|a1|b1|a4|b4|a5|b5|a8 |b8 |a9 |b9 |a12|b12|a13|b13|
|c0|d0|c1|d1|c4|d4|c5|d5|c8 |d8 |c9 |d9 |c12|d12|c13|d13|
|e0|f0|e1|f1|e4|f4|e5|f5|e8 |f8 |e9 |f9 |e12|f12|e13|f13|
|g0|h0|g1|h1|g4|h4|g5|h5|g8 |h8 |g9 |h9 |g12|h12|g13|h13|
|a2|b2|a3|b3|a6|b6|a7|b7|a10|b10|a11|b11|a14|b14|a15|b15|
|c2|d2|c3|d3|c6|d6|c7|d7|c10|d10|c11|d11|c14|d14|c15|d15|
|e2|f2|e3|f3|e6|f6|e7|f7|e10|f10|e11|f11|e14|f14|e15|f15|
|g2|h2|g3|h3|g6|h6|g7|h7|g10|h10|g11|h11|g14|h14|g15|h15|
Step 2: 4-element interleave for matrix
|a0|b0|c0|d0|a4|b4|c4|d4|a8 |b8 |c8 |d8 |a12|b12|c12|d12|
|a1|b1|c1|d1|a5|b5|c5|d5|a9 |b9 |c9 |d9 |a13|b13|c13|d13|
|e0|f0|g0|h0|e4|f4|g4|h4|e8 |f8 |g8 |h8 |e12|f12|g12|h12|
|e1|f1|g1|h1|e5|f5|g5|h5|e9 |f9 |g9 |h9 |e13|f13|g13|h13|
|a2|b2|c2|d2|a6|b6|c6|d6|a10|b10|c10|d10|a14|b14|c14|d14|
|a3|b3|c3|d3|a7|b7|c7|d7|a11|b11|c11|d11|a15|b15|c15|d15|
|e2|f2|g2|h2|e6|f6|g6|h6|e10|f10|g10|h10|e14|f14|g14|h14|
|e3|f3|g3|h3|e7|f7|g7|h7|e11|f11|g11|h11|e15|f15|g15|h15|
*/
#define FP32_INTERLEAVE_8x16(regArray) \
regArray##_8 = _mm512_unpacklo_ps(regArray##_0, regArray##_1); \
regArray##_9 = _mm512_unpacklo_ps(regArray##_2, regArray##_3); \
regArray##_10 = _mm512_unpacklo_ps(regArray##_4, regArray##_5); \
regArray##_11 = _mm512_unpacklo_ps(regArray##_6, regArray##_7); \
regArray##_12 = _mm512_unpackhi_ps(regArray##_0, regArray##_1); \
regArray##_13 = _mm512_unpackhi_ps(regArray##_2, regArray##_3); \
regArray##_14 = _mm512_unpackhi_ps(regArray##_4, regArray##_5); \
regArray##_15 = _mm512_unpackhi_ps(regArray##_6, regArray##_7); \
\
regArray##_0 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \
regArray##_1 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_8, (__m512d) regArray##_9); \
regArray##_4 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
regArray##_5 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_10, (__m512d) regArray##_11); \
regArray##_2 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
regArray##_3 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_12, (__m512d) regArray##_13); \
regArray##_6 = (__m512) _mm512_unpacklo_pd((__m512d) regArray##_14, (__m512d) regArray##_15); \
regArray##_7 = (__m512) _mm512_unpackhi_pd((__m512d) regArray##_14, (__m512d) regArray##_15);
/* Array-indexed form of FP32_INTERLEAVE_8x16: the same 2-step interleave, but the registers are
addressed as regArray[0] .. regArray[15] instead of through pasted _N suffixes
Input - regArray[0] .. regArray[7]
Output - regArray[0] .. regArray[7] (written in the order 0, 1, 4, 5, 2, 3, 6, 7)
Scratch - regArray[8] .. regArray[15] receive the Step 1 result and are clobbered
*/
#define FP32_INTERLEAVE_8x16_ARRAY(regArray) \
regArray[8] = _mm512_unpacklo_ps(regArray[0], regArray[1]); \
regArray[9] = _mm512_unpacklo_ps(regArray[2], regArray[3]); \
regArray[10] = _mm512_unpacklo_ps(regArray[4], regArray[5]); \
regArray[11] = _mm512_unpacklo_ps(regArray[6], regArray[7]); \
regArray[12] = _mm512_unpackhi_ps(regArray[0], regArray[1]); \
regArray[13] = _mm512_unpackhi_ps(regArray[2], regArray[3]); \
regArray[14] = _mm512_unpackhi_ps(regArray[4], regArray[5]); \
regArray[15] = _mm512_unpackhi_ps(regArray[6], regArray[7]); \
\
regArray[0] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[8], (__m512d) regArray[9]); \
regArray[1] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[8], (__m512d) regArray[9]); \
regArray[4] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
regArray[5] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[10], (__m512d) regArray[11]); \
regArray[2] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
regArray[3] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[12], (__m512d) regArray[13]); \
regArray[6] = (__m512) _mm512_unpacklo_pd((__m512d) regArray[14], (__m512d) regArray[15]); \
regArray[7] = (__m512) _mm512_unpackhi_pd((__m512d) regArray[14], (__m512d) regArray[15]);
/* 2-step interleave for matrix against 8 rows with 8 fp32 elements per row
Input - register array of 8 rows of row-major matrix (rows a..h in regArray##_0 .. regArray##_7, one 256-bit register per row)
Output - the output of Step 2
Scratch - regArray##_8 .. regArray##_15 receive the output of Step 1 (in the order listed below) and are clobbered
Note - the Step 2 rows are listed in the order they are computed; they are stored to
regArray##_0, _1, _4, _5, _2, _3, _6, _7 respectively, so regArray##_0 .. _3 end up with rows a-d
and regArray##_4 .. _7 with rows e-h
Step 1: 2-element interleave for matrix
|a0|b0|a1|b1|a4|b4|a5|b5|
|c0|d0|c1|d1|c4|d4|c5|d5|
|e0|f0|e1|f1|e4|f4|e5|f5|
|g0|h0|g1|h1|g4|h4|g5|h5|
|a2|b2|a3|b3|a6|b6|a7|b7|
|c2|d2|c3|d3|c6|d6|c7|d7|
|e2|f2|e3|f3|e6|f6|e7|f7|
|g2|h2|g3|h3|g6|h6|g7|h7|
Step 2: 4-element interleave for matrix
|a0|b0|c0|d0|a4|b4|c4|d4|
|a1|b1|c1|d1|a5|b5|c5|d5|
|e0|f0|g0|h0|e4|f4|g4|h4|
|e1|f1|g1|h1|e5|f5|g5|h5|
|a2|b2|c2|d2|a6|b6|c6|d6|
|a3|b3|c3|d3|a7|b7|c7|d7|
|e2|f2|g2|h2|e6|f6|g6|h6|
|e3|f3|g3|h3|e7|f7|g7|h7|
*/
#define FP32_INTERLEAVE_8x8(regArray) \
regArray##_8 = _mm256_unpacklo_ps(regArray##_0, regArray##_1); \
regArray##_9 = _mm256_unpacklo_ps(regArray##_2, regArray##_3); \
regArray##_10 = _mm256_unpacklo_ps(regArray##_4, regArray##_5); \
regArray##_11 = _mm256_unpacklo_ps(regArray##_6, regArray##_7); \
regArray##_12 = _mm256_unpackhi_ps(regArray##_0, regArray##_1); \
regArray##_13 = _mm256_unpackhi_ps(regArray##_2, regArray##_3); \
regArray##_14 = _mm256_unpackhi_ps(regArray##_4, regArray##_5); \
regArray##_15 = _mm256_unpackhi_ps(regArray##_6, regArray##_7); \
\
regArray##_0 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \
regArray##_1 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_8, (__m256d) regArray##_9); \
regArray##_4 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
regArray##_5 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_10, (__m256d) regArray##_11); \
regArray##_2 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
regArray##_3 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_12, (__m256d) regArray##_13); \
regArray##_6 = (__m256) _mm256_unpacklo_pd((__m256d) regArray##_14, (__m256d) regArray##_15); \
regArray##_7 = (__m256) _mm256_unpackhi_pd((__m256d) regArray##_14, (__m256d) regArray##_15);
/* Accumulate the result for 2 batches of 4 registers
On exit regArray##_0 = _0 + _1 + _2 + _3 and regArray##_4 = _4 + _5 + _6 + _7 (element-wise, 512-bit).
regArray##_2 and regArray##_6 are overwritten with the partial sums _2 + _3 and _6 + _7;
regArray##_1, _3, _5, _7 are only read.
*/
#define FP32_ACCUM2_8x16(regArray) \
regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_1); \
regArray##_2 = _mm512_add_ps(regArray##_2, regArray##_3); \
regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_5); \
regArray##_6 = _mm512_add_ps(regArray##_6, regArray##_7); \
regArray##_0 = _mm512_add_ps(regArray##_0, regArray##_2); \
regArray##_4 = _mm512_add_ps(regArray##_4, regArray##_6);
/* Array-indexed form of FP32_ACCUM2_8x16
On exit regArray[0] = [0] + [1] + [2] + [3] and regArray[4] = [4] + [5] + [6] + [7] (element-wise, 512-bit).
regArray[2] and regArray[6] are overwritten with partial sums.
*/
#define FP32_ACCUM2_8x16_ARRAY(regArray) \
regArray[0] = _mm512_add_ps(regArray[0], regArray[1]); \
regArray[2] = _mm512_add_ps(regArray[2], regArray[3]); \
regArray[4] = _mm512_add_ps(regArray[4], regArray[5]); \
regArray[6] = _mm512_add_ps(regArray[6], regArray[7]); \
regArray[0] = _mm512_add_ps(regArray[0], regArray[2]); \
regArray[4] = _mm512_add_ps(regArray[4], regArray[6]);
/* Accumulate the result for 2 batches of 4 registers
On exit regArray##_0 = _0 + _1 + _2 + _3 and regArray##_4 = _4 + _5 + _6 + _7 (element-wise, 256-bit).
regArray##_2 and regArray##_6 are overwritten with the partial sums _2 + _3 and _6 + _7;
regArray##_1, _3, _5, _7 are only read.
*/
#define FP32_ACCUM2_8x8(regArray) \
regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_1); \
regArray##_2 = _mm256_add_ps(regArray##_2, regArray##_3); \
regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_5); \
regArray##_6 = _mm256_add_ps(regArray##_6, regArray##_7); \
regArray##_0 = _mm256_add_ps(regArray##_0, regArray##_2); \
regArray##_4 = _mm256_add_ps(regArray##_4, regArray##_6);
/* Store 16 (alpha * result + beta * y) to y
regResult - 512-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 16 floats of y (read and written with unaligned accesses)
ALPHAVECTOR and BETAVECTOR are not parameters; they must be visible where the macro is expanded
*/
#define STORE16_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_loadu_ps(targetAddr))); \
_mm512_storeu_ps(targetAddr, regResult);
/* Masked store 16 (alpha * result + beta * y) to y
regResult - 512-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
ALPHAVECTOR and BETAVECTOR are not parameters; they must be visible where the macro is expanded
*/
#define STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_mul_ps(BETAVECTOR, _mm512_maskz_loadu_ps(mask, targetAddr))); \
_mm512_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 8 (alpha * result + beta * y) to y
regResult - 256-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 8 floats of y (read and written with unaligned accesses)
The low 256 bits of the 512-bit ALPHAVECTOR / BETAVECTOR (visible at the expansion site) are used
*/
#define STORE8_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_loadu_ps(targetAddr))); \
_mm256_storeu_ps(targetAddr, regResult);
/* Masked store 8 (alpha * result + beta * y) to y
regResult - 256-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
The low 256 bits of the 512-bit ALPHAVECTOR / BETAVECTOR (visible at the expansion site) are used
*/
#define STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_mul_ps(_mm512_castps512_ps256(BETAVECTOR), _mm256_maskz_loadu_ps(mask, targetAddr))); \
_mm256_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 4 (alpha * result + beta * y) to y
regResult - 128-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 4 floats of y (read and written with unaligned accesses)
The low 128 bits of the 512-bit ALPHAVECTOR / BETAVECTOR (visible at the expansion site) are used
*/
#define STORE4_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr) \
regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_loadu_ps(targetAddr))); \
_mm_storeu_ps(targetAddr, regResult);
/* Masked store 4 (alpha * result + beta * y) to y
regResult - 128-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
The low 128 bits of the 512-bit ALPHAVECTOR / BETAVECTOR (visible at the expansion site) are used
*/
#define STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA(regResult, targetAddr, mask) \
regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_mul_ps(_mm512_castps512_ps128(BETAVECTOR), _mm_maskz_loadu_ps(mask, targetAddr))); \
_mm_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 16 (alpha * result + y) to y
Variant for beta == 1: y is added without a multiplication by BETAVECTOR
regResult - 512-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 16 floats of y (read and written with unaligned accesses)
ALPHAVECTOR is not a parameter; it must be visible where the macro is expanded
*/
#define STORE16_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_loadu_ps(targetAddr)); \
_mm512_storeu_ps(targetAddr, regResult);
/* Masked store 16 (alpha * result + y) to y
regResult - 512-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
ALPHAVECTOR is not a parameter; it must be visible where the macro is expanded
*/
#define STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
regResult = _mm512_fmadd_ps(ALPHAVECTOR, regResult, _mm512_maskz_loadu_ps(mask, targetAddr)); \
_mm512_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 8 (alpha * result + y) to y
regResult - 256-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 8 floats of y (read and written with unaligned accesses)
The low 256 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE8_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_loadu_ps(targetAddr)); \
_mm256_storeu_ps(targetAddr, regResult);
/* Masked store 8 (alpha * result + y) to y
regResult - 256-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
The low 256 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
regResult = _mm256_fmadd_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult, _mm256_maskz_loadu_ps(mask, targetAddr)); \
_mm256_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 4 (alpha * result + y) to y
regResult - 128-bit fp32 result; overwritten with the value that is stored
targetAddr - address of the 4 floats of y (read and written with unaligned accesses)
The low 128 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE4_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr) \
regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_loadu_ps(targetAddr)); \
_mm_storeu_ps(targetAddr, regResult);
/* Masked store 4 (alpha * result + y) to y
regResult - 128-bit fp32 result; overwritten with the computed value
targetAddr - address of y; only the elements whose bit is set in mask are loaded and stored
mask - element mask (masked-off elements are loaded as zero and left untouched in memory)
The low 128 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE(regResult, targetAddr, mask) \
regResult = _mm_fmadd_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult, _mm_maskz_loadu_ps(mask, targetAddr)); \
_mm_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 16 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of the 16 floats of y (unaligned store)
ALPHAVECTOR is not a parameter; it must be visible where the macro is expanded
*/
#define STORE16_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
_mm512_storeu_ps(targetAddr, _mm512_mul_ps(ALPHAVECTOR, regResult));
/* Masked store 16 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of y; only the elements whose bit is set in mask are written
ALPHAVECTOR is not a parameter; it must be visible where the macro is expanded
*/
#define STORE16_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
_mm512_mask_storeu_ps(targetAddr, mask, _mm512_mul_ps(ALPHAVECTOR, regResult));
/* Store 8 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of the 8 floats of y (unaligned store)
The low 256 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE8_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
_mm256_storeu_ps(targetAddr, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
/* Masked store 8 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of y; only the elements whose bit is set in mask are written
The low 256 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE8_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
_mm256_mask_storeu_ps(targetAddr, mask, _mm256_mul_ps(_mm512_castps512_ps256(ALPHAVECTOR), regResult));
/* Store 4 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of the 4 floats of y (unaligned store)
The low 128 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE4_COMPLETE_RESULT_ALPHA(regResult, targetAddr) \
_mm_storeu_ps(targetAddr, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
/* Masked store 4 (alpha * result) to y
The previous content of y is not read, and regResult itself is left unchanged
targetAddr - address of y; only the elements whose bit is set in mask are written
The low 128 bits of the 512-bit ALPHAVECTOR (visible at the expansion site) are used
*/
#define STORE4_MASK_COMPLETE_RESULT_ALPHA(regResult, targetAddr, mask) \
_mm_mask_storeu_ps(targetAddr, mask, _mm_mul_ps(_mm512_castps512_ps128(ALPHAVECTOR), regResult));
/* Store 16 result to y
regResult is written unscaled (neither alpha nor beta is applied) with an unaligned 512-bit store
*/
#define STORE16_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
_mm512_storeu_ps(targetAddr, regResult);
/* Masked store 16 result to y
regResult is written unscaled; only the elements whose bit is set in mask are written
*/
#define STORE16_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
_mm512_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 8 result to y
regResult is written unscaled (neither alpha nor beta is applied) with an unaligned 256-bit store
*/
#define STORE8_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
_mm256_storeu_ps(targetAddr, regResult);
/* Masked store 8 result to y
regResult is written unscaled; only the elements whose bit is set in mask are written
*/
#define STORE8_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
_mm256_mask_storeu_ps(targetAddr, mask, regResult);
/* Store 4 result to y
regResult is written unscaled (neither alpha nor beta is applied) with an unaligned 128-bit store
*/
#define STORE4_COMPLETE_RESULT_DIRECT(regResult, targetAddr) \
_mm_storeu_ps(targetAddr, regResult);
/* Masked store 4 result to y
regResult is written unscaled; only the elements whose bit is set in mask are written
*/
#define STORE4_MASK_COMPLETE_RESULT_DIRECT(regResult, targetAddr, mask) \
_mm_mask_storeu_ps(targetAddr, mask, regResult);
#endif

137
kernel/x86_64/sbgemv_n.c Normal file
View File

@ -0,0 +1,137 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined (COOPERLAKE)
#include "sbgemv_n_microk_cooperlake.c"
#endif
/* Allocate storage for alloc_size elements of TYPE, aligned to 64 bytes.
   ptr       - receives the raw malloc() result; hand this one to ALIGN64_FREE
   ptr_align - receives the first 64-byte aligned address inside the allocation
               (equal to ptr, i.e. NULL, when malloc() fails)
   63 spare bytes are requested so that rounding the address up never shrinks
   the usable area below alloc_size elements.
   Every argument is parenthesized and the body is wrapped in do/while(0), so
   size expressions such as "m + 1" and un-braced if/else call sites behave
   as a single statement with the intended size. */
#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
    do { \
        (ptr) = (TYPE *) malloc(sizeof(TYPE) * (size_t)(alloc_size) + 63); \
        (ptr_align) = (((uintptr_t)(ptr) & (uintptr_t)0x3F) != 0) \
            ? (TYPE *)((char *)(ptr) + (64 - (int)((uintptr_t)(ptr) & (uintptr_t)0x3F))) \
            : (ptr); \
    } while (0)
/* Release a buffer obtained through ALIGN64_ALLOC; ptr must be the raw
   pointer, not the aligned one. */
#define ALIGN64_FREE(ptr) \
    free(ptr)
#ifndef HAVE_SBGEMV_N_ACCL_KERNEL
/* Reference (non-accelerated) kernel: y = alpha * A * x + beta * y.
   m, n   - number of rows / columns of A
   a, lda - bfloat16 matrix, element (i, j) is read from a[lda*j + i]
   x      - n contiguous bfloat16 values
   y      - m contiguous floats, updated in place; y is not read when beta == ZERO
   The bfloat16 inputs are first widened to float with SBF16TOS_K and the
   products are then accumulated in single precision.
   If a temporary buffer cannot be allocated the function returns without
   touching y (it has no way to report the failure to its caller). */
static void sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
{
    float accum;
    float *a_fp32 = malloc(sizeof(float)*m*n);
    float *x_fp32 = malloc(sizeof(float)*n);

    /* free(NULL) is a no-op, so both pointers can be released unconditionally */
    if (a_fp32 == NULL || x_fp32 == NULL) {
        free(a_fp32);
        free(x_fp32);
        return;
    }

    SBF16TOS_K(n, x, 1, x_fp32, 1);

    /* Widen A one column at a time; this also packs the columns densely
       (stride m instead of lda) without a bfloat16 staging copy. */
    for (BLASLONG j=0; j<n; j++) {
        SBF16TOS_K(m, a + lda * j, 1, a_fp32 + m * j, 1);
    }

    for (BLASLONG i=0; i<m; i++) {
        accum = 0.0;
        for (BLASLONG j=0; j<n; j++) {
            accum += a_fp32[j*m + i] * x_fp32[j];
        }
        /* beta == ZERO must ignore the previous content of y */
        if (beta == ZERO) {
            y[i] = alpha * accum;
        } else {
            y[i] = alpha * accum + beta * y[i];
        }
    }

    free(a_fp32);
    free(x_fp32);
}
#endif
/* Gather n bfloat16 values that lie inc elements apart in src into the
   contiguous array target. */
static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        const BLASLONG from = k * inc;
        target[k] = src[from];
        k++;
    }
}
/* Gather n floats that lie inc elements apart in src into the contiguous
   array target. */
static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        const BLASLONG from = k * inc;
        target[k] = src[from];
        k++;
    }
}
/* Scatter n contiguous floats from src into target, placing consecutive
   values inc elements apart. */
static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        const BLASLONG to = k * inc;
        target[to] = src[k];
        k++;
    }
}
/* Driver for the non-transposed bfloat16 GEMV: y = alpha * A * x + beta * y.
   sbgemv_kernel_n only handles unit-stride vectors, so a strided x is packed
   into a temporary 64-byte aligned buffer, and a strided y is computed in a
   temporary buffer and scattered back afterwards.
   Returns 0 on success (including the m < 1 || n < 1 no-op case) and 1 when a
   temporary buffer cannot be allocated, in which case y is left unmodified. */
int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
{
    if ( m < 1 || n < 1) return(0);

    /* Unit-stride vectors are used in place */
    bfloat16 * xbuffer_align = x;
    float    * ybuffer_align = y;

    /* Raw allocations backing the aligned pointers; NULL when unused */
    bfloat16 * xbuffer = NULL;
    float    * ybuffer = NULL;

    if (incx != 1) {
        ALIGN64_ALLOC(n, bfloat16, xbuffer_align, xbuffer);
        if (xbuffer == NULL) return(1);
        bf16_compress_vector(n, x, xbuffer_align, incx);
    }

    if (incy != 1) {
        ALIGN64_ALLOC(m, float, ybuffer_align, ybuffer);
        if (ybuffer == NULL) {
            /* xbuffer is NULL when x was used in place; freeing NULL is harmless */
            ALIGN64_FREE(xbuffer);
            return(1);
        }
        /* With beta == ZERO the old y does not contribute, so skip the gather */
        if (beta != ZERO) {
            fp32_compress_vector(m, y, ybuffer_align, incy);
        }
    }

    sbgemv_kernel_n(m, n, alpha, a, lda, xbuffer_align, beta, ybuffer_align);

    if (incy != 1) {
        fp32_expand_vector(m, ybuffer_align, y, incy);
        ALIGN64_FREE(ybuffer);
    }

    if (incx != 1) {
        ALIGN64_FREE(xbuffer);
    }

    return(0);
}

View File

@ -0,0 +1,76 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* The accelerated kernel is only built with GCC >= 10 when __AVX512BF16__ is defined, or with clang >= 9 */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
// Advertise that this file provides sbgemv_kernel_n
// NOTE(review): presumably tested by the including file to skip its generic fallback - confirm
#define HAVE_SBGEMV_N_ACCL_KERNEL 1
#include "common.h"
#include <immintrin.h>
// The template below is included four times. ZERO_BETA / ONE_BETA / ONE_ALPHA select which
// specialised kernel each inclusion defines, so the order of the #define/#undef lines matters.
// 1) ALPHA not ONE && BETA effective && BETA not ONE -> *_alpha_beta kernels
#undef ZERO_BETA
#undef ONE_BETA
#undef ONE_ALPHA
#include "sbgemv_n_microk_cooperlake_template.c"
// 2) ALPHA not ONE && BETA == ONE -> *_alpha_one kernels
#undef ZERO_BETA
#define ONE_BETA 1
#undef ONE_ALPHA
#include "sbgemv_n_microk_cooperlake_template.c"
// 3) ALPHA not ONE && BETA in-effective (BETA == 0) -> *_alpha kernels
//    (ONE_BETA is still defined from step 2; the template only looks at it when ZERO_BETA is undefined)
#define ZERO_BETA 1
#undef ONE_ALPHA
#include "sbgemv_n_microk_cooperlake_template.c"
// 4) ALPHA == ONE && BETA in-effective (BETA == 0) -> plain kernels without suffix
#define ZERO_BETA 1
#define ONE_ALPHA 1
#include "sbgemv_n_microk_cooperlake_template.c"
// Dispatch to the micro kernel specialised for the given ALPHA/BETA values:
//   beta != 0, beta != 1 : y = alpha*A*x + beta*y   (*_alpha_beta)
//   beta == 1            : y = alpha*A*x + y        (*_alpha_one)
//   beta == 0, alpha != 1: y = alpha*A*x            (*_alpha)
//   beta == 0, alpha == 1: y = A*x                  (no suffix)
// Always returns 0.
static int sbgemv_kernel_n(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
{
    if (beta != ZERO) {
        // The original Y data takes part in the result, whatever ALPHA is
        if (beta != ONE) {
            sbgemv_kernel_32xN_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
        } else {
            sbgemv_kernel_32xN_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
        }
    } else if (alpha != ONE) {
        // Y is overwritten; the product still has to be scaled by ALPHA
        sbgemv_kernel_32xN_lda_direct_alpha(m, n, alpha, a, lda, x, y);
    } else {
        // Y is overwritten and no scaling is needed
        sbgemv_kernel_32xN_lda_direct(m, n, alpha, a, lda, x, y);
    }

    return 0;
}
#endif

View File

@ -0,0 +1,234 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include <immintrin.h>
#include "common.h"
// Include common macros for BF16 based operations with IA intrinsics
#include "bf16_common_macros.h"
// This template is meant to be included once per ALPHA/BETA configuration.
// Drop the mappings left over from a previous inclusion first: redefining a
// macro with a different replacement list without #undef is diagnosed by the
// preprocessor. (#undef of a name that is not defined is harmless.)
#undef STORE16_COMPLETE_RESULT
#undef STORE16_MASK_COMPLETE_RESULT
#undef STORE8_COMPLETE_RESULT
#undef STORE8_MASK_COMPLETE_RESULT
#undef STORE4_COMPLETE_RESULT
#undef STORE4_MASK_COMPLETE_RESULT

// Map the generic STORE* names used by the kernel body onto the variant that
// matches the current configuration.
#ifndef ZERO_BETA // Beta is non-zero
#ifndef ONE_BETA // BETA is not ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_BETA
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_BETA
#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_BETA
#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_BETA
#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_BETA
#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_BETA
#else // BETA is ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA_ONE
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA_ONE
#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA_ONE
#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA_ONE
#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA_ONE
#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA_ONE
#endif
#else // BETA is zero
#ifndef ONE_ALPHA // ALPHA is not ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_ALPHA
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_ALPHA
#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_ALPHA
#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_ALPHA
#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_ALPHA
#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_ALPHA
#else // ALPHA is ONE
#define STORE16_COMPLETE_RESULT STORE16_COMPLETE_RESULT_DIRECT
#define STORE16_MASK_COMPLETE_RESULT STORE16_MASK_COMPLETE_RESULT_DIRECT
#define STORE8_COMPLETE_RESULT STORE8_COMPLETE_RESULT_DIRECT
#define STORE8_MASK_COMPLETE_RESULT STORE8_MASK_COMPLETE_RESULT_DIRECT
#define STORE4_COMPLETE_RESULT STORE4_COMPLETE_RESULT_DIRECT
#define STORE4_MASK_COMPLETE_RESULT STORE4_MASK_COMPLETE_RESULT_DIRECT
#endif
#endif
// BF16 GEMV kernel for the non-transposed case. The name and signature depend on the
// ZERO_BETA / ONE_BETA / ONE_ALPHA configuration of the current template inclusion.
//
// Rows of the result are produced in tiles: 128 rows per iteration in the main loop,
// then 32 rows per iteration, then one masked tile for the remaining (m & 31) rows.
// For each tile the kernel walks over all n columns, loads 32 consecutive bf16 values of
// the column (a + lda*idx_n + row offset, as passed to the load macros), splits them into
// even-row and odd-row vectors and accumulates their product with the broadcast x[idx_n].
// The even/odd accumulators are interleaved back into row order before being stored.
//
// NOTE(review): BF16_MATRIX_LOAD_1x32 / BF16_MATRIX_MASKZ_LOAD_1x32 / BF16_DOT_1x32 and the
// STORE16_* macros come from bf16_common_macros.h; the STORE16_* variants presumably apply
// ALPHAVECTOR / BETAVECTOR by name - confirm against that header.
//
// Always returns 0.
#ifndef ZERO_BETA
#ifndef ONE_BETA
static int sbgemv_kernel_32xN_lda_direct_alpha_beta(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
#else
static int sbgemv_kernel_32xN_lda_direct_alpha_one(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
#endif
#else
#ifndef ONE_ALPHA
static int sbgemv_kernel_32xN_lda_direct_alpha(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
#else
static int sbgemv_kernel_32xN_lda_direct(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float *y)
#endif
#endif
{
    BLASLONG tag_m_32x = m & (~31);     // m rounded down to a multiple of 32
    BLASLONG tag_m_128x = m & (~127);   // m rounded down to a multiple of 128

    __m512 accum512_0, accum512_1, accum512_2, accum512_3, accum512_4, accum512_5, accum512_6, accum512_7, \
           accum512_8, accum512_9, accum512_10, accum512_11, accum512_12, accum512_13, accum512_14, accum512_15;

#ifndef ONE_ALPHA
    __m512 ALPHAVECTOR = _mm512_set1_ps(alpha);
#endif
#ifndef ZERO_BETA
    __m512 BETAVECTOR = _mm512_set1_ps(beta);
#endif

    __m512i matrixArray_seed_0, matrixArray_seed_1, matrixArray_seed_2, matrixArray_seed_3;
    __m512i matrixArray_0, matrixArray_1, matrixArray_2, matrixArray_3, matrixArray_4, matrixArray_5, matrixArray_6, matrixArray_7;
    __m512i xArray_0;

    __m512i ZERO512 = _mm512_setzero_si512();

    // 16-bit lane selectors: 0xaaaaaaaa keeps the odd lanes, 0x55555555 the even lanes
    unsigned int blend_hi_mask_value = ((unsigned int)0xaaaaaaaa);
    __mmask32 blend_hi_mask = *((__mmask32*) &blend_hi_mask_value);
    unsigned int blend_lo_mask_value = ((unsigned int)0x55555555);
    __mmask32 blend_lo_mask = *((__mmask32*) &blend_lo_mask_value);

    // Permutation indices that interleave the low 8 (idx_base_0) or the next 8 (idx_base_1)
    // lanes of two accumulators: even-row results and odd-row results -> consecutive rows
    __m512i M512_EPI32_8 = _mm512_set1_epi32(8);
    __m512i idx_base_0 = _mm512_set_epi32(23, 7, 22, 6, 21, 5, 20, 4, 19, 3, 18, 2, 17, 1, 16, 0);
    __m512i idx_base_1 = _mm512_add_epi32(idx_base_0, M512_EPI32_8);

    // Main loop: 128 rows (four 32-row tiles) per iteration
    for (BLASLONG idx_m = 0; idx_m < tag_m_128x; idx_m+=128) {
        accum512_0 = _mm512_setzero_ps();
        accum512_1 = _mm512_setzero_ps();
        accum512_2 = _mm512_setzero_ps();
        accum512_3 = _mm512_setzero_ps();
        accum512_4 = _mm512_setzero_ps();
        accum512_5 = _mm512_setzero_ps();
        accum512_6 = _mm512_setzero_ps();
        accum512_7 = _mm512_setzero_ps();

        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
            xArray_0 = _mm512_set1_epi16(x[idx_n]);

            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m + 0)
            BF16_MATRIX_LOAD_1x32(matrixArray_seed_1, a, lda, idx_n, idx_m + 32)
            BF16_MATRIX_LOAD_1x32(matrixArray_seed_2, a, lda, idx_n, idx_m + 64)
            BF16_MATRIX_LOAD_1x32(matrixArray_seed_3, a, lda, idx_n, idx_m + 96)

            // Even rows go to the even-numbered vectors, odd rows to the odd-numbered ones;
            // the other 16-bit lanes are zeroed
            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);
            matrixArray_2 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_1);
            matrixArray_3 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_1);
            matrixArray_4 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_2);
            matrixArray_5 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_2);
            matrixArray_6 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_3);
            matrixArray_7 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_3);

            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
            BF16_DOT_1x32(accum512_2, matrixArray_2, xArray_0)
            BF16_DOT_1x32(accum512_3, matrixArray_3, xArray_0)
            BF16_DOT_1x32(accum512_4, matrixArray_4, xArray_0)
            BF16_DOT_1x32(accum512_5, matrixArray_5, xArray_0)
            BF16_DOT_1x32(accum512_6, matrixArray_6, xArray_0)
            BF16_DOT_1x32(accum512_7, matrixArray_7, xArray_0)
        }

        // Interleave even-row / odd-row accumulators back into row order
        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);
        accum512_10 = _mm512_permutex2var_ps(accum512_2, idx_base_0, accum512_3);
        accum512_11 = _mm512_permutex2var_ps(accum512_2, idx_base_1, accum512_3);
        accum512_12 = _mm512_permutex2var_ps(accum512_4, idx_base_0, accum512_5);
        accum512_13 = _mm512_permutex2var_ps(accum512_4, idx_base_1, accum512_5);
        accum512_14 = _mm512_permutex2var_ps(accum512_6, idx_base_0, accum512_7);
        accum512_15 = _mm512_permutex2var_ps(accum512_6, idx_base_1, accum512_7);

        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
        STORE16_COMPLETE_RESULT(accum512_10, y+idx_m+32)
        STORE16_COMPLETE_RESULT(accum512_11, y+idx_m+48)
        STORE16_COMPLETE_RESULT(accum512_12, y+idx_m+64)
        STORE16_COMPLETE_RESULT(accum512_13, y+idx_m+80)
        STORE16_COMPLETE_RESULT(accum512_14, y+idx_m+96)
        STORE16_COMPLETE_RESULT(accum512_15, y+idx_m+112)
    }

    // Remaining complete 32-row tiles
    for (BLASLONG idx_m = tag_m_128x; idx_m < tag_m_32x; idx_m+=32) {
        accum512_0 = _mm512_setzero_ps();
        accum512_1 = _mm512_setzero_ps();

        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
            xArray_0 = _mm512_set1_epi16(x[idx_n]);

            BF16_MATRIX_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, idx_m)

            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);

            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
        }

        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);

        STORE16_COMPLETE_RESULT(accum512_8, y+idx_m+0)
        STORE16_COMPLETE_RESULT(accum512_9, y+idx_m+16)
    }

    // Tail: the last 1..31 rows, loaded with a mask so nothing past row m-1 is read
    if (tag_m_32x != m) {
        BLASLONG tail_rows = m - tag_m_32x;

        unsigned int tail_mask_value = (((unsigned int)0xffffffff) >> (32-(m&31)));
        __mmask32 tail_mask = *((__mmask32*) &tail_mask_value);

        // Mask for the last, partially filled group of 16 results ((m & 15) low bits set).
        // Kept in an unsigned int: it is read through a __mmask32 pointer, which would
        // read past the end of a narrower object.
        unsigned int store_tail_mask_value = (((unsigned int)0xffff) >> (16-(m&15)));
        __mmask32 store_tail_mask = *((__mmask32*) &store_tail_mask_value);

        accum512_0 = _mm512_setzero_ps();
        accum512_1 = _mm512_setzero_ps();

        for (BLASLONG idx_n = 0; idx_n < n; idx_n++) {
            xArray_0 = _mm512_set1_epi16(x[idx_n]);

            BF16_MATRIX_MASKZ_LOAD_1x32(matrixArray_seed_0, a, lda, idx_n, tag_m_32x, tail_mask)

            matrixArray_0 = _mm512_mask_blend_epi16(blend_lo_mask, ZERO512, matrixArray_seed_0);
            matrixArray_1 = _mm512_mask_blend_epi16(blend_hi_mask, ZERO512, matrixArray_seed_0);

            BF16_DOT_1x32(accum512_0, matrixArray_0, xArray_0)
            BF16_DOT_1x32(accum512_1, matrixArray_1, xArray_0)
        }

        accum512_8 = _mm512_permutex2var_ps(accum512_0, idx_base_0, accum512_1);
        accum512_9 = _mm512_permutex2var_ps(accum512_0, idx_base_1, accum512_1);

        if (tail_rows >= 16) {
            // The first 16 results are complete. This must include tail_rows == 16:
            // there (m & 15) == 0, so store_tail_mask is empty and a masked store
            // would silently drop all 16 results.
            STORE16_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0)
            if (tail_rows > 16) {
                STORE16_MASK_COMPLETE_RESULT(accum512_9, y+tag_m_32x+16, store_tail_mask)
            }
        } else {
            STORE16_MASK_COMPLETE_RESULT(accum512_8, y+tag_m_32x+0, store_tail_mask)
        }
    }

    return 0;
}

142
kernel/x86_64/sbgemv_t.c Normal file
View File

@ -0,0 +1,142 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#include "common.h"
#if defined (COOPERLAKE)
#include "sbgemv_t_microk_cooperlake.c"
#endif
/*
 * ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr)
 *   Allocates room for alloc_size elements of TYPE plus 63 bytes of slack,
 *   stores the raw malloc() result in ptr and the first 64-byte aligned
 *   address inside that allocation in ptr_align. If malloc() fails both
 *   pointers are NULL. Release the memory with ALIGN64_FREE(ptr), never with
 *   ptr_align.
 *
 *   The body is wrapped in do { } while (0) so the macro expands to a single
 *   statement (safe in an unbraced if/else), and the arguments are
 *   parenthesised so expressions such as "m + 1" are sized correctly.
 */
#define ALIGN64_ALLOC(alloc_size, TYPE, ptr_align, ptr) \
    do { \
        (ptr) = (TYPE *) malloc(sizeof(TYPE)*(alloc_size) + 63); \
        (ptr_align) = (TYPE *)(((uintptr_t)(ptr) + (uintptr_t)0x3F) & ~(uintptr_t)0x3F); \
    } while (0)

/* ALIGN64_FREE(ptr): releases the raw pointer obtained from ALIGN64_ALLOC */
#define ALIGN64_FREE(ptr) \
    free(ptr)
#ifndef HAVE_SBGEMV_T_ACCL_KERNEL
// Generic (non-accelerated) SBGEMV kernel for the transposed case.
//
// For every i in [0, m) this computes
//     y[i] = alpha * sum_{j in [0, n)} a[lda*i + j] * x[j]                 when beta == ZERO
//     y[i] = alpha * sum_{j in [0, n)} a[lda*i + j] * x[j] + beta * y[i]   otherwise
// i.e. m is the number of results and n the length of x. x and y are accessed
// with unit stride. When beta == ZERO the old content of y is never read.
//
// The function returns void, so an allocation failure cannot be reported: in
// that case the scratch buffers are released and y is left untouched instead
// of dereferencing a NULL pointer.
static void sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
{
    BLASLONG offset_lda, offset_n;
    float accum = 0.0;

    // Scratch buffers: a densely packed (leading dimension n) bf16 copy of A,
    // its fp32 counterpart and the fp32 counterpart of x.
    bfloat16 * a_bf16 = malloc(sizeof(bfloat16)*m*n);
    float * a_fp32 = malloc(sizeof(float)*m*n);
    float * x_fp32 = malloc(sizeof(float)*n);

    if (a_bf16 == NULL || a_fp32 == NULL || x_fp32 == NULL) {
        // free(NULL) is a no-op, so every buffer can be released unconditionally
        free(a_bf16);
        free(a_fp32);
        free(x_fp32);
        return;
    }

    // Drop the padding between the m vectors of A: vector i moves from offset lda*i to n*i
    for (BLASLONG i=0; i<m; i++) {
        offset_lda = lda * i;
        offset_n = n * i;
        for (BLASLONG j=0; j<n; j++) {
            a_bf16[offset_n + j] = a[offset_lda + j];
        }
    }

    // NOTE(review): SBF16TOS_K is presumably the bf16 -> fp32 conversion kernel
    // (count, src, src stride, dst, dst stride) - confirm against its definition.
    SBF16TOS_K(n, x, 1, x_fp32, 1);
    SBF16TOS_K(m*n, a_bf16, 1, a_fp32, 1);

    for (BLASLONG i=0; i<m; i++) {
        offset_n = n * i;
        accum = 0.0;
        for (BLASLONG j=0; j<n; j++) {
            accum += a_fp32[offset_n + j] * x_fp32[j];
        }
        if (beta == ZERO) {
            // y is write-only here, so stale/NaN content in y cannot leak into the result
            y[i] = alpha * accum;
        } else {
            y[i] = alpha * accum + beta * y[i];
        }
    }

    free(a_bf16);
    free(a_fp32);
    free(x_fp32);
}
#endif
// Gather n bf16 elements that are `inc` elements apart in `src` into the
// contiguous array `target` (target[k] = src[k*inc]).
static void bf16_compress_vector(BLASLONG n, bfloat16 * src, bfloat16 * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        target[k] = src[k * inc];
        k++;
    }
}
// Gather n floats that are `inc` elements apart in `src` into the contiguous
// array `target` (target[k] = src[k*inc]).
static void fp32_compress_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        target[k] = src[k * inc];
        k++;
    }
}
// Scatter the n contiguous floats of `src` into `target`, placing consecutive
// elements `inc` elements apart (target[k*inc] = src[k]).
static void fp32_expand_vector(BLASLONG n, float * src, float * target, BLASLONG inc)
{
    BLASLONG k = 0;

    while (k < n) {
        target[k * inc] = src[k];
        k++;
    }
}
// SBGEMV entry point for the transposed case.
//
// m and n are swapped on entry, so from there on n is the length of x and m
// the length of y. The compute kernel only handles unit-stride vectors, so x
// and/or y are packed into temporary contiguous buffers whenever incx/incy
// differ from 1, and the result is scattered back into y afterwards.
//
// Returns 0 on success (including the empty problem m < 1 || n < 1) and 1 if
// a temporary buffer could not be allocated; in the latter case y is left
// untouched.
int CNAME(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, BLASLONG incx, float beta, float * y, BLASLONG incy)
{
    if ( m < 1 || n < 1) return(0);

    // By default the kernel works directly on the caller's vectors
    bfloat16 * xbuffer_align = x;
    float * ybuffer_align = y;

    // Raw allocations backing the packed copies (only set when packing is needed)
    bfloat16 * xbuffer = NULL;
    float * ybuffer = NULL;

    // Switch m and n
    BLASLONG t = m;
    m = n;
    n = t;

    if (incx != 1) {
        ALIGN64_ALLOC(n, bfloat16, xbuffer_align, xbuffer);
        if (xbuffer == NULL) {
            return(1);
        }
        bf16_compress_vector(n, x, xbuffer_align, incx);
    }

    if (incy != 1) {
        ALIGN64_ALLOC(m, float, ybuffer_align, ybuffer);
        if (ybuffer == NULL) {
            // Do not leak the packed copy of x on the error path
            if (incx != 1) {
                ALIGN64_FREE(xbuffer);
            }
            return(1);
        }
        // With beta == ZERO the kernel never reads y, so there is nothing to pack
        if (beta != ZERO) {
            fp32_compress_vector(m, y, ybuffer_align, incy);
        }
    }

    sbgemv_kernel_t(m, n, alpha, a, lda, xbuffer_align, beta, ybuffer_align);

    if (incy != 1) {
        // Scatter the contiguous result back to the strided destination
        fp32_expand_vector(m, ybuffer_align, y, incy);
        ALIGN64_FREE(ybuffer);
    }

    if (incx != 1) {
        ALIGN64_FREE(xbuffer);
    }

    return(0);
}

View File

@ -0,0 +1,202 @@
/***************************************************************************
Copyright (c) 2014, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* The accelerated kernel is only built with GCC >= 10 when __AVX512BF16__ is defined, or with clang >= 9 */
#if (( defined(__GNUC__) && __GNUC__ >= 10 && defined(__AVX512BF16__)) || (defined(__clang__) && __clang_major__ >= 9))
// Advertise that this file provides sbgemv_kernel_t; sbgemv_t.c only compiles its generic
// fallback kernel when this macro is not defined
#define HAVE_SBGEMV_T_ACCL_KERNEL 1
// The template below is included four times. ZERO_BETA / ONE_BETA / ONE_ALPHA select which
// specialised kernels each inclusion defines, so the order of the #define/#undef lines matters.
// 1) ALPHA not ONE && BETA effective && BETA not ONE -> *_alpha_beta kernels
#undef ZERO_BETA
#undef ONE_BETA
#undef ONE_ALPHA
#include "sbgemv_t_microk_cooperlake_template.c"
// 2) ALPHA not ONE && BETA == ONE -> *_alpha_one kernels
#undef ZERO_BETA
#define ONE_BETA 1
#undef ONE_ALPHA
#include "sbgemv_t_microk_cooperlake_template.c"
// 3) ALPHA not ONE && BETA in-effective (BETA == 0) -> *_alpha kernels
//    NOTE(review): ONE_BETA is still defined from step 2; presumably the template ignores it
//    whenever ZERO_BETA is defined - confirm against the template
#define ZERO_BETA 1
#undef ONE_ALPHA
#include "sbgemv_t_microk_cooperlake_template.c"
// 4) ALPHA == ONE && BETA in-effective (BETA == 0) -> plain kernels without suffix
#define ZERO_BETA 1
#define ONE_ALPHA 1
#include "sbgemv_t_microk_cooperlake_template.c"
// Dispatch to the micro kernel specialised for the given ALPHA/BETA values and for n.
//
// Kernel family (name suffix):
//   beta != 0, beta != 1 : *_alpha_beta
//   beta == 1            : *_alpha_one
//   beta == 0, alpha != 1: *_alpha
//   beta == 0, alpha == 1: no suffix
//
// Within each family the kernel is picked by n:
//   n > 127              : 1x128_lda_direct
//   32 < n <= 127        : 8x32_lda_direct
//   16 < n <= 32         : 8x16p_lda
//   n <= 16, lda != n    : 8x16m_lda
//   n <= 16, lda == n    : dedicated kernel for that exact n (1..16)
//
// Always returns 0.
static int sbgemv_kernel_t(BLASLONG m, BLASLONG n, float alpha, bfloat16 *a, BLASLONG lda, bfloat16 *x, float beta, float *y)
{
    if (beta != ZERO) {
        // The original Y data takes part in the result, whatever ALPHA is
        if (beta != ONE) {
            if (n > 127) {
                sbgemv_kernel_1x128_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
            } else if (n > 32) {
                sbgemv_kernel_8x32_lda_direct_alpha_beta(m, n, alpha, a, lda, x, beta, y);
            } else if (n > 16) {
                sbgemv_kernel_8x16p_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
            } else if (lda != n) {
                sbgemv_kernel_8x16m_lda_alpha_beta(m, n, alpha, a, lda, x, beta, y);
            } else {
                switch(n) {
                    case 1:  sbgemv_kernel_32x1_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 2:  sbgemv_kernel_32x2_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 3:  sbgemv_kernel_32x3_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 4:  sbgemv_kernel_16x4_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 5:  sbgemv_kernel_30x5_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 6:  sbgemv_kernel_16x6_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 7:  sbgemv_kernel_16x7_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 8:  sbgemv_kernel_16x8_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 9:  sbgemv_kernel_14x9_alpha_beta (m, alpha, a, x, beta, y); break;
                    case 10: sbgemv_kernel_12x10_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 11: sbgemv_kernel_15x11_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 12: sbgemv_kernel_15x12_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 13: sbgemv_kernel_16x13_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 14: sbgemv_kernel_16x14_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 15: sbgemv_kernel_16x15_alpha_beta(m, alpha, a, x, beta, y); break;
                    case 16: sbgemv_kernel_16x16_alpha_beta(m, alpha, a, x, beta, y); break;
                    default: break;
                }
            }
        } else {
            // BETA == 1.0
            if (n > 127) {
                sbgemv_kernel_1x128_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
            } else if (n > 32) {
                sbgemv_kernel_8x32_lda_direct_alpha_one(m, n, alpha, a, lda, x, beta, y);
            } else if (n > 16) {
                sbgemv_kernel_8x16p_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
            } else if (lda != n) {
                sbgemv_kernel_8x16m_lda_alpha_one(m, n, alpha, a, lda, x, beta, y);
            } else {
                switch(n) {
                    case 1:  sbgemv_kernel_32x1_alpha_one (m, alpha, a, x, beta, y); break;
                    case 2:  sbgemv_kernel_32x2_alpha_one (m, alpha, a, x, beta, y); break;
                    case 3:  sbgemv_kernel_32x3_alpha_one (m, alpha, a, x, beta, y); break;
                    case 4:  sbgemv_kernel_16x4_alpha_one (m, alpha, a, x, beta, y); break;
                    case 5:  sbgemv_kernel_30x5_alpha_one (m, alpha, a, x, beta, y); break;
                    case 6:  sbgemv_kernel_16x6_alpha_one (m, alpha, a, x, beta, y); break;
                    case 7:  sbgemv_kernel_16x7_alpha_one (m, alpha, a, x, beta, y); break;
                    case 8:  sbgemv_kernel_16x8_alpha_one (m, alpha, a, x, beta, y); break;
                    case 9:  sbgemv_kernel_14x9_alpha_one (m, alpha, a, x, beta, y); break;
                    case 10: sbgemv_kernel_12x10_alpha_one(m, alpha, a, x, beta, y); break;
                    case 11: sbgemv_kernel_15x11_alpha_one(m, alpha, a, x, beta, y); break;
                    case 12: sbgemv_kernel_15x12_alpha_one(m, alpha, a, x, beta, y); break;
                    case 13: sbgemv_kernel_16x13_alpha_one(m, alpha, a, x, beta, y); break;
                    case 14: sbgemv_kernel_16x14_alpha_one(m, alpha, a, x, beta, y); break;
                    case 15: sbgemv_kernel_16x15_alpha_one(m, alpha, a, x, beta, y); break;
                    case 16: sbgemv_kernel_16x16_alpha_one(m, alpha, a, x, beta, y); break;
                    default: break;
                }
            }
        }
    } else if (alpha != ONE) {
        // BETA == 0.0: Y is overwritten; the product still has to be multiplied by ALPHA
        if (n > 127) {
            sbgemv_kernel_1x128_lda_direct_alpha(m, n, alpha, a, lda, x, y);
        } else if (n > 32) {
            sbgemv_kernel_8x32_lda_direct_alpha(m, n, alpha, a, lda, x, y);
        } else if (n > 16) {
            sbgemv_kernel_8x16p_lda_alpha(m, n, alpha, a, lda, x, y);
        } else if (lda != n) {
            sbgemv_kernel_8x16m_lda_alpha(m, n, alpha, a, lda, x, y);
        } else {
            switch(n) {
                case 1:  sbgemv_kernel_32x1_alpha (m, alpha, a, x, y); break;
                case 2:  sbgemv_kernel_32x2_alpha (m, alpha, a, x, y); break;
                case 3:  sbgemv_kernel_32x3_alpha (m, alpha, a, x, y); break;
                case 4:  sbgemv_kernel_16x4_alpha (m, alpha, a, x, y); break;
                case 5:  sbgemv_kernel_30x5_alpha (m, alpha, a, x, y); break;
                case 6:  sbgemv_kernel_16x6_alpha (m, alpha, a, x, y); break;
                case 7:  sbgemv_kernel_16x7_alpha (m, alpha, a, x, y); break;
                case 8:  sbgemv_kernel_16x8_alpha (m, alpha, a, x, y); break;
                case 9:  sbgemv_kernel_14x9_alpha (m, alpha, a, x, y); break;
                case 10: sbgemv_kernel_12x10_alpha(m, alpha, a, x, y); break;
                case 11: sbgemv_kernel_15x11_alpha(m, alpha, a, x, y); break;
                case 12: sbgemv_kernel_15x12_alpha(m, alpha, a, x, y); break;
                case 13: sbgemv_kernel_16x13_alpha(m, alpha, a, x, y); break;
                case 14: sbgemv_kernel_16x14_alpha(m, alpha, a, x, y); break;
                case 15: sbgemv_kernel_16x15_alpha(m, alpha, a, x, y); break;
                case 16: sbgemv_kernel_16x16_alpha(m, alpha, a, x, y); break;
                default: break;
            }
        }
    } else {
        // BETA == 0.0 and ALPHA == 1.0: Y is overwritten and no scaling is needed
        if (n > 127) {
            sbgemv_kernel_1x128_lda_direct(m, n, alpha, a, lda, x, y);
        } else if (n > 32) {
            sbgemv_kernel_8x32_lda_direct(m, n, alpha, a, lda, x, y);
        } else if (n > 16) {
            sbgemv_kernel_8x16p_lda(m, n, alpha, a, lda, x, y);
        } else if (lda != n) {
            sbgemv_kernel_8x16m_lda(m, n, alpha, a, lda, x, y);
        } else {
            switch(n) {
                case 1:  sbgemv_kernel_32x1 (m, alpha, a, x, y); break;
                case 2:  sbgemv_kernel_32x2 (m, alpha, a, x, y); break;
                case 3:  sbgemv_kernel_32x3 (m, alpha, a, x, y); break;
                case 4:  sbgemv_kernel_16x4 (m, alpha, a, x, y); break;
                case 5:  sbgemv_kernel_30x5 (m, alpha, a, x, y); break;
                case 6:  sbgemv_kernel_16x6 (m, alpha, a, x, y); break;
                case 7:  sbgemv_kernel_16x7 (m, alpha, a, x, y); break;
                case 8:  sbgemv_kernel_16x8 (m, alpha, a, x, y); break;
                case 9:  sbgemv_kernel_14x9 (m, alpha, a, x, y); break;
                case 10: sbgemv_kernel_12x10(m, alpha, a, x, y); break;
                case 11: sbgemv_kernel_15x11(m, alpha, a, x, y); break;
                case 12: sbgemv_kernel_15x12(m, alpha, a, x, y); break;
                case 13: sbgemv_kernel_16x13(m, alpha, a, x, y); break;
                case 14: sbgemv_kernel_16x14(m, alpha, a, x, y); break;
                case 15: sbgemv_kernel_16x15(m, alpha, a, x, y); break;
                case 16: sbgemv_kernel_16x16(m, alpha, a, x, y); break;
                default: break;
            }
        }
    }

    return 0;
}
#endif

File diff suppressed because it is too large Load Diff