Merge pull request #390 from wernsaar/develop
Ref #103: enhancement for small matrix dimensions. Fixed some bugs. Enable sgemm for SNB and dgemm for NEHALEM
This commit is contained in:
commit
d10db52edb
4
Makefile
4
Makefile
|
@ -36,9 +36,13 @@ ifndef BINARY64
|
||||||
else
|
else
|
||||||
@echo " BINARY ... 64bit "
|
@echo " BINARY ... 64bit "
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
|
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
|
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
|
||||||
ifndef NOFORTRAN
|
ifndef NOFORTRAN
|
||||||
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
|
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"
|
||||||
|
|
|
@ -133,7 +133,8 @@ NO_AFFINITY = 1
|
||||||
# COMMON_OPT = -O2
|
# COMMON_OPT = -O2
|
||||||
|
|
||||||
# gfortran option for LAPACK
|
# gfortran option for LAPACK
|
||||||
FCOMMON_OPT = -frecursive
|
# enable this flag only on 64bit Linux and if you need a thread safe lapack library
|
||||||
|
# FCOMMON_OPT = -frecursive
|
||||||
|
|
||||||
# Profiling flags
|
# Profiling flags
|
||||||
COMMON_PROF = -pg
|
COMMON_PROF = -pg
|
||||||
|
|
|
@ -46,15 +46,55 @@ ifdef TARGET
|
||||||
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
GETARCH_FLAGS := -DFORCE_$(TARGET)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Force fallbacks for 32bit
|
||||||
|
|
||||||
|
ifeq ($(BINARY), 32)
|
||||||
|
ifeq ($(TARGET), HASWELL)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET), SANDYBRIDGE)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET), BULLDOZER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET), PILEDRIVER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
|
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
|
||||||
#
|
#
|
||||||
ifdef TARGET_CORE
|
ifdef TARGET_CORE
|
||||||
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
|
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
# Force fallbacks for 32bit
|
||||||
|
|
||||||
|
ifeq ($(BINARY), 32)
|
||||||
|
ifeq ($(TARGET_CORE), HASWELL)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), SANDYBRIDGE)
|
||||||
|
GETARCH_FLAGS := -DFORCE_NEHALEM
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), BULLDOZER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
|
ifeq ($(TARGET_CORE), PILEDRIVER)
|
||||||
|
GETARCH_FLAGS := -DFORCE_BARCELONA
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
GETARCH_FLAGS += -DUSE64BITINT
|
GETARCH_FLAGS += -DUSE64BITINT
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef GEMM_MULTITHREAD_THRESHOLD
|
ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||||
GEMM_MULTITHREAD_THRESHOLD=4
|
GEMM_MULTITHREAD_THRESHOLD=4
|
||||||
|
@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
|
||||||
GETARCH_FLAGS += -DNO_AVX
|
GETARCH_FLAGS += -DNO_AVX
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(BINARY), 32)
|
||||||
|
GETARCH_FLAGS += -DNO_AVX
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(DEBUG), 1)
|
ifeq ($(DEBUG), 1)
|
||||||
GETARCH_FLAGS += -g
|
GETARCH_FLAGS += -g
|
||||||
endif
|
endif
|
||||||
|
@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
ifeq ($(ARCH), x86)
|
ifeq ($(ARCH), x86)
|
||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
|
|
||||||
endif
|
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
|
@ -503,8 +544,10 @@ else
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
FCOMMON_OPT += -m64
|
FCOMMON_OPT += -m64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -fdefault-integer-8
|
FCOMMON_OPT += -fdefault-integer-8
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -m32
|
FCOMMON_OPT += -m32
|
||||||
endif
|
endif
|
||||||
|
@ -517,8 +560,10 @@ endif
|
||||||
ifeq ($(F_COMPILER), INTEL)
|
ifeq ($(F_COMPILER), INTEL)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_INTEL
|
CCOMMON_OPT += -DF_INTERFACE_INTEL
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
ifdef USE_OPENMP
|
ifdef USE_OPENMP
|
||||||
FCOMMON_OPT += -openmp
|
FCOMMON_OPT += -openmp
|
||||||
endif
|
endif
|
||||||
|
@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
FCOMMON_OPT += -q64
|
FCOMMON_OPT += -q64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -qintsize=8
|
FCOMMON_OPT += -qintsize=8
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -q32
|
FCOMMON_OPT += -q32
|
||||||
endif
|
endif
|
||||||
|
@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
|
||||||
COMMON_PROF += -DPGICOMPILER
|
COMMON_PROF += -DPGICOMPILER
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
FCOMMON_OPT += -tp p7-64
|
FCOMMON_OPT += -tp p7-64
|
||||||
else
|
else
|
||||||
FCOMMON_OPT += -tp p7
|
FCOMMON_OPT += -tp p7
|
||||||
|
@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
|
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifneq ($(ARCH), mips64)
|
ifneq ($(ARCH), mips64)
|
||||||
ifndef BINARY64
|
ifndef BINARY64
|
||||||
|
@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
|
||||||
CCOMMON_OPT += -DF_INTERFACE_OPEN64
|
CCOMMON_OPT += -DF_INTERFACE_OPEN64
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
FCOMMON_OPT += -i8
|
FCOMMON_OPT += -i8
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), mips64)
|
ifeq ($(ARCH), mips64)
|
||||||
ifndef BINARY64
|
ifndef BINARY64
|
||||||
|
@ -682,10 +735,12 @@ endif
|
||||||
|
|
||||||
ifdef BINARY64
|
ifdef BINARY64
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
CCOMMON_OPT +=
|
CCOMMON_OPT +=
|
||||||
#-DUSE64BITINT
|
#-DUSE64BITINT
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(NEED_PIC), 1)
|
ifeq ($(NEED_PIC), 1)
|
||||||
ifeq ($(C_COMPILER), IBM)
|
ifeq ($(C_COMPILER), IBM)
|
||||||
|
@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
|
||||||
CCOMMON_OPT += -DNO_AVX
|
CCOMMON_OPT += -DNO_AVX
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(BINARY), 32)
|
||||||
|
CCOMMON_OPT += -DNO_AVX
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef SMP
|
ifdef SMP
|
||||||
CCOMMON_OPT += -DSMP_SERVER
|
CCOMMON_OPT += -DSMP_SERVER
|
||||||
|
|
||||||
|
@ -872,8 +931,11 @@ endif
|
||||||
LAPACK_CFLAGS = $(CFLAGS)
|
LAPACK_CFLAGS = $(CFLAGS)
|
||||||
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
|
||||||
ifdef INTERFACE64
|
ifdef INTERFACE64
|
||||||
|
ifneq ($(INTERFACE64), 0)
|
||||||
LAPACK_CFLAGS += -DLAPACK_ILP64
|
LAPACK_CFLAGS += -DLAPACK_ILP64
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifdef OS_WINDOWS
|
ifdef OS_WINDOWS
|
||||||
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
|
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
|
||||||
endif
|
endif
|
||||||
|
|
|
@ -40,6 +40,7 @@
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
#include "cpuid.h"
|
#include "cpuid.h"
|
||||||
|
|
||||||
|
/*
|
||||||
#ifdef NO_AVX
|
#ifdef NO_AVX
|
||||||
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
|
||||||
#define CORE_HASWELL CORE_NEHALEM
|
#define CORE_HASWELL CORE_NEHALEM
|
||||||
|
@ -50,6 +51,7 @@
|
||||||
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
|
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
|
||||||
#define CORE_PILEDRIVER CORE_BARCELONA
|
#define CORE_PILEDRIVER CORE_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
*/
|
||||||
|
|
||||||
#ifndef CPUIDEMU
|
#ifndef CPUIDEMU
|
||||||
|
|
||||||
|
|
|
@ -39,7 +39,7 @@
|
||||||
#include "common.h"
|
#include "common.h"
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMP
|
||||||
#ifndef USE64BITINT
|
#if !defined(USE64BITINT) || defined(ARCH_X86)
|
||||||
unsigned int blas_quick_divide_table[] = {
|
unsigned int blas_quick_divide_table[] = {
|
||||||
0x00000000, 0x00000001, 0x80000001, 0x55555556,
|
0x00000000, 0x00000001, 0x80000001, 0x55555556,
|
||||||
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,
|
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,
|
||||||
|
|
|
@ -72,7 +72,7 @@
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
#ifndef GEMM_MULTITHREAD_THRESHOLD
|
||||||
# define GEMM_MULTITHREAD_THRESHOLD 4
|
#define GEMM_MULTITHREAD_THRESHOLD 4
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
static int (*gemm[])(blas_arg_t *, BLASLONG *, BLASLONG *, FLOAT *, FLOAT *, BLASLONG) = {
|
||||||
|
@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
|
||||||
mode |= (transa << BLAS_TRANSA_SHIFT);
|
mode |= (transa << BLAS_TRANSA_SHIFT);
|
||||||
mode |= (transb << BLAS_TRANSB_SHIFT);
|
mode |= (transb << BLAS_TRANSB_SHIFT);
|
||||||
|
|
||||||
|
int nthreads_max = num_cpu_avail(3);
|
||||||
|
int nthreads_avail = nthreads_max;
|
||||||
|
|
||||||
|
#ifndef COMPLEX
|
||||||
|
double MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
|
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
|
nthreads_max = 1;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
|
{
|
||||||
|
nthreads_max = 4;
|
||||||
|
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
|
||||||
|
{
|
||||||
|
nthreads_max = 2;
|
||||||
|
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
double MNK = (double) args.m * (double) args.n * (double) args.k;
|
||||||
|
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
|
nthreads_max = 1;
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
|
||||||
|
{
|
||||||
|
nthreads_max = 4;
|
||||||
|
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
|
||||||
|
{
|
||||||
|
nthreads_max = 2;
|
||||||
|
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
args.common = NULL;
|
args.common = NULL;
|
||||||
|
|
||||||
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|
if ( nthreads_max > nthreads_avail )
|
||||||
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
|
args.nthreads = nthreads_avail;
|
||||||
args.nthreads = 1;
|
else
|
||||||
}else{
|
args.nthreads = nthreads_max;
|
||||||
args.nthreads = num_cpu_avail(3);
|
|
||||||
}
|
|
||||||
if (args.nthreads == 1) {
|
if (args.nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
|
||||||
blasint incy = *INCY;
|
blasint incy = *INCY;
|
||||||
blasint lda = *LDA;
|
blasint lda = *LDA;
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
blasint info, t;
|
blasint info, t;
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
nthreads = num_cpu_avail(2);
|
nthreads = num_cpu_avail(2);
|
||||||
|
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);
|
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);
|
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);
|
||||||
|
|
|
@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
|
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
|
||||||
|
|
||||||
if(*dd1 < ZERO)
|
if(*dd1 < ZERO)
|
||||||
{
|
{
|
||||||
|
|
|
@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
|
||||||
blasint incy = *INCY;
|
blasint incy = *INCY;
|
||||||
blasint lda = *LDA;
|
blasint lda = *LDA;
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
FLOAT *buffer;
|
FLOAT *buffer;
|
||||||
blasint info, t;
|
blasint info, t;
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
int nthreads;
|
int nthreads;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
|
|
||||||
buffer = (FLOAT *)blas_memory_alloc(1);
|
buffer = (FLOAT *)blas_memory_alloc(1);
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
nthreads = num_cpu_avail(2);
|
nthreads = num_cpu_avail(2);
|
||||||
|
|
||||||
if (nthreads == 1) {
|
if (nthreads == 1) {
|
||||||
|
@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifdef SMP
|
#ifdef SMPBUG
|
||||||
|
|
||||||
} else {
|
} else {
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
|
|
||||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
|
|
||||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
|
|
||||||
|
|
||||||
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
|
||||||
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
|
|
@ -1,3 +1,7 @@
|
||||||
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
|
|
||||||
|
|
||||||
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
|
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
|
||||||
SGEMMINCOPY = gemm_ncopy_4.S
|
SGEMMINCOPY = gemm_ncopy_4.S
|
||||||
SGEMMITCOPY = gemm_tcopy_4.S
|
SGEMMITCOPY = gemm_tcopy_4.S
|
||||||
|
@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
DGEMMKERNEL = gemm_kernel_4x4_core2.S
|
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
|
||||||
DGEMMINCOPY =
|
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||||
DGEMMITCOPY =
|
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||||
DGEMMONCOPY = gemm_ncopy_4.S
|
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
||||||
DGEMMOTCOPY = gemm_tcopy_4.S
|
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
||||||
DGEMMINCOPYOBJ =
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMITCOPYOBJ =
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
|
||||||
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
|
||||||
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
|
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
|
||||||
|
|
||||||
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S
|
DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
|
||||||
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S
|
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
|
||||||
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S
|
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
|
||||||
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S
|
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
|
||||||
|
|
||||||
|
|
||||||
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
|
||||||
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S
|
||||||
|
|
|
@ -1,3 +1,6 @@
|
||||||
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
|
|
||||||
ZGEMVNKERNEL = zgemv_n_dup.S
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
ZGEMVTKERNEL = zgemv_t_dup.S
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
|
|
@ -1,14 +1,16 @@
|
||||||
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
|
SGEMVNKERNEL = sgemv_n.S
|
||||||
SGEMMINCOPY = gemm_ncopy_4.S
|
SGEMVTKERNEL = sgemv_t.S
|
||||||
SGEMMITCOPY = gemm_tcopy_4.S
|
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
|
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
|
||||||
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
|
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
|
||||||
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
|
||||||
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
DGEMMITCOPY = ../generic/gemm_tcopy_8.c
|
||||||
|
|
|
@ -79,8 +79,7 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
|
|
@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 256*8*4
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define KK 72(%rsp)
|
#define KK 72(%rsp)
|
||||||
#define KKK 80(%rsp)
|
#define KKK 80(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
|
@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define KK 72(%rsp)
|
#define KK 72(%rsp)
|
||||||
#define KKK 80(%rsp)
|
#define KKK 80(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
|
@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#else
|
#else
|
||||||
|
|
||||||
#define STACKSIZE 256
|
#define STACKSIZE 256
|
||||||
#define L_BUFFER_SIZE 128*8*12+4096
|
#define L_BUFFER_SIZE 128*8*12+512
|
||||||
|
|
||||||
#define OLD_A 40 + STACKSIZE(%rsp)
|
#define OLD_A 40 + STACKSIZE(%rsp)
|
||||||
#define OLD_B 48 + STACKSIZE(%rsp)
|
#define OLD_B 48 + STACKSIZE(%rsp)
|
||||||
|
|
|
@ -148,8 +148,8 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
#define LB2_OFFSET 4096
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
|
|
@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
#define LB2_OFFSET 4096
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
|
|
@ -78,8 +78,8 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
#define LB2_OFFSET 4096
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
|
|
@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
#define LB2_OFFSET 4096
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
|
|
@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define KK 64(%rsp)
|
#define KK 64(%rsp)
|
||||||
#define KKK 72(%rsp)
|
#define KKK 72(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -79,8 +79,7 @@
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -91,7 +90,6 @@
|
||||||
#define KK 72(%rsp)
|
#define KK 72(%rsp)
|
||||||
#define KKK 80(%rsp)
|
#define KKK 80(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
|
@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 256*8*4
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define KK 72(%rsp)
|
#define KK 72(%rsp)
|
||||||
#define KKK 80(%rsp)
|
#define KKK 80(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
|
@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define L_BUFFER_SIZE 512*8*4
|
#define L_BUFFER_SIZE 8192
|
||||||
#define LB2_OFFSET 512*8*2
|
|
||||||
|
|
||||||
#define Ndiv6 24(%rsp)
|
#define Ndiv6 24(%rsp)
|
||||||
#define Nmod6 32(%rsp)
|
#define Nmod6 32(%rsp)
|
||||||
|
@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define KK 72(%rsp)
|
#define KK 72(%rsp)
|
||||||
#define KKK 80(%rsp)
|
#define KKK 80(%rsp)
|
||||||
#define BUFFER1 128(%rsp)
|
#define BUFFER1 128(%rsp)
|
||||||
#define BUFFER2 LB2_OFFSET+128(%rsp)
|
|
||||||
|
|
||||||
#if defined(OS_WINDOWS)
|
#if defined(OS_WINDOWS)
|
||||||
#if L_BUFFER_SIZE > 16384
|
#if L_BUFFER_SIZE > 16384
|
||||||
|
|
|
@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
|
||||||
0 5 7 3 200 Values of INIBL (nibble crossover point)
|
0 5 7 3 200 Values of INIBL (nibble crossover point)
|
||||||
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
|
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
|
||||||
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
|
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
|
||||||
20.0 Threshold value
|
70.0 Threshold value
|
||||||
T Put T to test the error exits
|
T Put T to test the error exits
|
||||||
1 Code to interpret the seed
|
1 Code to interpret the seed
|
||||||
NEP 21
|
NEP 21
|
||||||
|
|
21
param.h
21
param.h
|
@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||||
#else
|
#else
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 4
|
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 1
|
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
#define SGEMM_DEFAULT_UNROLL_N 8
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
#define DGEMM_DEFAULT_UNROLL_N 8
|
||||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
#define GETRF_FACTOR 0.72
|
#define GETRF_FACTOR 0.72
|
||||||
|
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_N 2
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||||
#else
|
#else
|
||||||
#define SGEMM_DEFAULT_UNROLL_M 4
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
#define DGEMM_DEFAULT_UNROLL_M 8
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
#define QGEMM_DEFAULT_UNROLL_M 2
|
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||||
#define XGEMM_DEFAULT_UNROLL_M 1
|
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_UNROLL_N 8
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define XGEMM_DEFAULT_UNROLL_N 1
|
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_P 512
|
#define SGEMM_DEFAULT_P 768
|
||||||
#define SGEMM_DEFAULT_R sgemm_r
|
#define SGEMM_DEFAULT_R sgemm_r
|
||||||
//#define SGEMM_DEFAULT_R 1024
|
//#define SGEMM_DEFAULT_R 1024
|
||||||
|
|
||||||
|
@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define XGEMM_DEFAULT_P 252
|
#define XGEMM_DEFAULT_P 252
|
||||||
#define XGEMM_DEFAULT_R xgemm_r
|
#define XGEMM_DEFAULT_R xgemm_r
|
||||||
|
|
||||||
#define SGEMM_DEFAULT_Q 256
|
#define SGEMM_DEFAULT_Q 384
|
||||||
#define DGEMM_DEFAULT_Q 256
|
#define DGEMM_DEFAULT_Q 256
|
||||||
#define QGEMM_DEFAULT_Q 128
|
#define QGEMM_DEFAULT_Q 128
|
||||||
#define CGEMM_DEFAULT_Q 256
|
#define CGEMM_DEFAULT_Q 256
|
||||||
#define ZGEMM_DEFAULT_Q 192
|
#define ZGEMM_DEFAULT_Q 192
|
||||||
#define XGEMM_DEFAULT_Q 128
|
#define XGEMM_DEFAULT_Q 128
|
||||||
|
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_N 2
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
|
|
||||||
#define GETRF_FACTOR 0.72
|
#define GETRF_FACTOR 0.72
|
||||||
|
|
||||||
#endif
|
#endif
|
||||||
|
|
Loading…
Reference in New Issue