Merge pull request #390 from wernsaar/develop

Ref #103: enhancements for small matrix dimensions, several bug fixes, and enabling of the optimized sgemm kernel for Sandy Bridge (SNB) and the optimized dgemm kernel for NEHALEM.
This commit is contained in:
Zhang Xianyi 2014-06-27 14:57:06 +08:00
commit d10db52edb
30 changed files with 3375 additions and 75 deletions

View File

@ -36,9 +36,13 @@ ifndef BINARY64
else
@echo " BINARY ... 64bit "
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
@echo " Use 64 bits int (equivalent to \"-i8\" in Fortran) "
endif
endif
@echo " C compiler ... $(C_COMPILER) (command line : $(CC))"
ifndef NOFORTRAN
@echo " Fortran compiler ... $(F_COMPILER) (command line : $(FC))"

View File

@ -133,7 +133,8 @@ NO_AFFINITY = 1
# COMMON_OPT = -O2
# gfortran option for LAPACK
FCOMMON_OPT = -frecursive
# Enable this flag only on 64-bit Linux, and only if you need a thread-safe LAPACK library
# FCOMMON_OPT = -frecursive
# Profiling flags
COMMON_PROF = -pg

View File

@ -46,15 +46,55 @@ ifdef TARGET
GETARCH_FLAGS := -DFORCE_$(TARGET)
endif
# 32-bit builds: force TARGET HASWELL/SANDYBRIDGE back to NEHALEM and BULLDOZER/PILEDRIVER back to BARCELONA
ifeq ($(BINARY), 32)
ifeq ($(TARGET), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
#TARGET_CORE will override TARGET which is used in DYNAMIC_ARCH=1.
#
ifdef TARGET_CORE
GETARCH_FLAGS := -DFORCE_$(TARGET_CORE)
endif
# 32-bit builds: force TARGET_CORE HASWELL/SANDYBRIDGE back to NEHALEM and BULLDOZER/PILEDRIVER back to BARCELONA
ifeq ($(BINARY), 32)
ifeq ($(TARGET_CORE), HASWELL)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), SANDYBRIDGE)
GETARCH_FLAGS := -DFORCE_NEHALEM
endif
ifeq ($(TARGET_CORE), BULLDOZER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
ifeq ($(TARGET_CORE), PILEDRIVER)
GETARCH_FLAGS := -DFORCE_BARCELONA
endif
endif
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
GETARCH_FLAGS += -DUSE64BITINT
endif
endif
ifndef GEMM_MULTITHREAD_THRESHOLD
GEMM_MULTITHREAD_THRESHOLD=4
@ -65,6 +105,10 @@ ifeq ($(NO_AVX), 1)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(BINARY), 32)
GETARCH_FLAGS += -DNO_AVX
endif
ifeq ($(DEBUG), 1)
GETARCH_FLAGS += -g
endif
@ -336,9 +380,6 @@ ifeq ($(DYNAMIC_ARCH), 1)
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL
endif
endif
ifeq ($(ARCH), x86_64)
@ -503,8 +544,10 @@ else
ifdef BINARY64
FCOMMON_OPT += -m64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -fdefault-integer-8
endif
endif
else
FCOMMON_OPT += -m32
endif
@ -517,8 +560,10 @@ endif
ifeq ($(F_COMPILER), INTEL)
CCOMMON_OPT += -DF_INTERFACE_INTEL
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
ifdef USE_OPENMP
FCOMMON_OPT += -openmp
endif
@ -537,8 +582,10 @@ CCOMMON_OPT += -DF_INTERFACE_IBM
ifdef BINARY64
FCOMMON_OPT += -q64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -qintsize=8
endif
endif
else
FCOMMON_OPT += -q32
endif
@ -552,8 +599,10 @@ CCOMMON_OPT += -DF_INTERFACE_PGI
COMMON_PROF += -DPGICOMPILER
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
FCOMMON_OPT += -tp p7-64
else
FCOMMON_OPT += -tp p7
@ -567,9 +616,11 @@ ifeq ($(F_COMPILER), PATHSCALE)
CCOMMON_OPT += -DF_INTERFACE_PATHSCALE
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif
ifneq ($(ARCH), mips64)
ifndef BINARY64
@ -594,9 +645,11 @@ ifeq ($(F_COMPILER), OPEN64)
CCOMMON_OPT += -DF_INTERFACE_OPEN64
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
FCOMMON_OPT += -i8
endif
endif
endif
ifeq ($(ARCH), mips64)
ifndef BINARY64
@ -682,10 +735,12 @@ endif
ifdef BINARY64
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
CCOMMON_OPT +=
#-DUSE64BITINT
endif
endif
endif
ifeq ($(NEED_PIC), 1)
ifeq ($(C_COMPILER), IBM)
@ -718,6 +773,10 @@ ifeq ($(NO_AVX), 1)
CCOMMON_OPT += -DNO_AVX
endif
ifeq ($(BINARY), 32)
CCOMMON_OPT += -DNO_AVX
endif
ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
@ -872,8 +931,11 @@ endif
LAPACK_CFLAGS = $(CFLAGS)
LAPACK_CFLAGS += -DHAVE_LAPACK_CONFIG_H
ifdef INTERFACE64
ifneq ($(INTERFACE64), 0)
LAPACK_CFLAGS += -DLAPACK_ILP64
endif
endif
ifdef OS_WINDOWS
LAPACK_CFLAGS += -DOPENBLAS_OS_WINDOWS
endif

View File

@ -40,6 +40,7 @@
#include <string.h>
#include "cpuid.h"
/*
#ifdef NO_AVX
#define CPUTYPE_HASWELL CPUTYPE_NEHALEM
#define CORE_HASWELL CORE_NEHALEM
@ -50,6 +51,7 @@
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#endif
*/
#ifndef CPUIDEMU

View File

@ -39,7 +39,7 @@
#include "common.h"
#ifdef SMP
#ifndef USE64BITINT
#if !defined(USE64BITINT) || defined(ARCH_X86)
unsigned int blas_quick_divide_table[] = {
0x00000000, 0x00000001, 0x80000001, 0x55555556,
0x40000001, 0x33333334, 0x2aaaaaab, 0x24924925,

View File

@ -400,14 +400,63 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
mode |= (transa << BLAS_TRANSA_SHIFT);
mode |= (transb << BLAS_TRANSB_SHIFT);
int nthreads_max = num_cpu_avail(3);
int nthreads_avail = nthreads_max;
#ifndef COMPLEX
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (1024.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (65536.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 16 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 3 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#else
double MNK = (double) args.m * (double) args.n * (double) args.k;
if ( MNK <= (256.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
nthreads_max = 1;
else
{
if ( MNK <= (16384.0 * (double) GEMM_MULTITHREAD_THRESHOLD) )
{
nthreads_max = 4;
if ( args.m < 3 * GEMM_MULTITHREAD_THRESHOLD )
{
nthreads_max = 2;
if ( args.m <= 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.n < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
if ( args.k < 1 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 1;
}
else
{
if ( args.n < 2 * GEMM_MULTITHREAD_THRESHOLD ) nthreads_max = 2;
}
}
}
#endif
args.common = NULL;
if(args.m <= GEMM_MULTITHREAD_THRESHOLD || args.n <= GEMM_MULTITHREAD_THRESHOLD
|| args.k <=GEMM_MULTITHREAD_THRESHOLD){
args.nthreads = 1;
}else{
args.nthreads = num_cpu_avail(3);
}
if ( nthreads_max > nthreads_avail )
args.nthreads = nthreads_avail;
else
args.nthreads = nthreads_max;
if (args.nthreads == 1) {
#endif

View File

@ -75,7 +75,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif
@ -107,7 +107,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif
@ -167,15 +167,16 @@ void CNAME(enum CBLAS_ORDER order,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
#endif
GER(m, n, 0, alpha, x, incx, y, incy, a, lda, buffer);
#ifdef SMP
#ifdef SMPBUG
} else {
GER_THREAD(m, n, alpha, x, incx, y, incy, a, lda, buffer, nthreads);

View File

@ -62,7 +62,7 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
#endif
FLOAT du, dp1, dp2, dq2, dq1, dh11, dh21, dh12, dh22, dflag, dtemp;
FLOAT du, dp1, dp2, dq2, dq1, dh11=ZERO, dh21=ZERO, dh12=ZERO, dh22=ZERO, dflag=-ONE, dtemp;
if(*dd1 < ZERO)
{

View File

@ -109,7 +109,7 @@ void NAME(blasint *M, blasint *N, FLOAT *Alpha,
blasint incy = *INCY;
blasint lda = *LDA;
FLOAT *buffer;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif
@ -144,7 +144,7 @@ void CNAME(enum CBLAS_ORDER order,
FLOAT *buffer;
blasint info, t;
#ifdef SMP
#ifdef SMPBUG
int nthreads;
#endif
@ -205,7 +205,7 @@ void CNAME(enum CBLAS_ORDER order,
buffer = (FLOAT *)blas_memory_alloc(1);
#ifdef SMP
#ifdef SMPBUG
nthreads = num_cpu_avail(2);
if (nthreads == 1) {
@ -221,7 +221,7 @@ void CNAME(enum CBLAS_ORDER order,
}
#endif
#ifdef SMP
#ifdef SMPBUG
} else {

View File

@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

View File

@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

View File

@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
SGEMMKERNEL = sgemm_kernel_16x4_haswell.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c

View File

@ -1,3 +1,7 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
@ -9,13 +13,13 @@ SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_4x4_core2.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4.S
DGEMMOTCOPY = gemm_tcopy_4.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMKERNEL = gemm_kernel_2x8_nehalem.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
@ -44,11 +48,10 @@ STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S
STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_core2.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_core2.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_core2.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S

View File

@ -1,3 +1,6 @@
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S

View File

@ -1,14 +1,16 @@
SGEMMKERNEL = gemm_kernel_4x8_nehalem.S
SGEMMINCOPY = gemm_ncopy_4.S
SGEMMITCOPY = gemm_tcopy_4.S
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMVNKERNEL = sgemv_n.S
SGEMVTKERNEL = sgemv_t.S
SGEMMKERNEL = sgemm_kernel_16x4_sandy.S
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_4x8_sandy.S
DGEMMINCOPY = ../generic/gemm_ncopy_8.c
DGEMMITCOPY = ../generic/gemm_tcopy_8.c

View File

@ -79,8 +79,7 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)

View File

@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

View File

@ -93,8 +93,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -105,7 +104,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

View File

@ -85,7 +85,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#define STACKSIZE 256
#define L_BUFFER_SIZE 128*8*12+4096
#define L_BUFFER_SIZE 128*8*12+512
#define OLD_A 40 + STACKSIZE(%rsp)
#define OLD_B 48 + STACKSIZE(%rsp)

View File

@ -148,8 +148,8 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)

View File

@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)

View File

@ -78,8 +78,8 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)

View File

@ -105,8 +105,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define LB2_OFFSET 4096
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)

View File

@ -90,8 +90,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -101,7 +100,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 64(%rsp)
#define KKK 72(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

File diff suppressed because it is too large Load Diff

View File

@ -79,8 +79,7 @@
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -91,7 +90,6 @@
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

View File

@ -104,8 +104,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 256*8*4
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -116,7 +115,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

View File

@ -92,8 +92,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define L_BUFFER_SIZE 512*8*4
#define LB2_OFFSET 512*8*2
#define L_BUFFER_SIZE 8192
#define Ndiv6 24(%rsp)
#define Nmod6 32(%rsp)
@ -104,7 +103,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define KK 72(%rsp)
#define KKK 80(%rsp)
#define BUFFER1 128(%rsp)
#define BUFFER2 LB2_OFFSET+128(%rsp)
#if defined(OS_WINDOWS)
#if L_BUFFER_SIZE > 16384

View File

@ -10,7 +10,7 @@ NEP: Data file for testing Nonsymmetric Eigenvalue Problem routines
0 5 7 3 200 Values of INIBL (nibble crossover point)
1 2 4 2 1 Values of ISHFTS (number of simultaneous shifts)
0 1 2 0 1 Values of IACC22 (select structured matrix multiply: 0, 1 or 2)
20.0 Threshold value
70.0 Threshold value
T Put T to test the error exits
1 Code to interpret the seed
NEP 21

21
param.h
View File

@ -1032,14 +1032,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1
#else
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
@ -1073,6 +1073,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GETRF_FACTOR 0.72
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8
#endif
@ -1104,14 +1108,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#else
#define SGEMM_DEFAULT_UNROLL_M 4
#define SGEMM_DEFAULT_UNROLL_M 16
#define DGEMM_DEFAULT_UNROLL_M 8
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 4
#define XGEMM_DEFAULT_UNROLL_M 1
#define SGEMM_DEFAULT_UNROLL_N 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
@ -1119,7 +1123,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_UNROLL_N 1
#endif
#define SGEMM_DEFAULT_P 512
#define SGEMM_DEFAULT_P 768
#define SGEMM_DEFAULT_R sgemm_r
//#define SGEMM_DEFAULT_R 1024
@ -1141,13 +1145,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define XGEMM_DEFAULT_P 252
#define XGEMM_DEFAULT_R xgemm_r
#define SGEMM_DEFAULT_Q 256
#define SGEMM_DEFAULT_Q 384
#define DGEMM_DEFAULT_Q 256
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 256
#define ZGEMM_DEFAULT_Q 192
#define XGEMM_DEFAULT_Q 128
#define CGEMM3M_DEFAULT_UNROLL_N 4
#define CGEMM3M_DEFAULT_UNROLL_M 8
#define ZGEMM3M_DEFAULT_UNROLL_N 2
#define ZGEMM3M_DEFAULT_UNROLL_M 8
#define GETRF_FACTOR 0.72
#endif