diff --git a/Makefile.system b/Makefile.system index 9663322fa..2134786c0 100644 --- a/Makefile.system +++ b/Makefile.system @@ -324,14 +324,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER HASWELL endif endif diff --git a/common_x86.h b/common_x86.h index 48517d900..49e6be29e 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif +#if defined(SANDYBRIDGE) || defined(HASWELL) +//Enable some optimization for nehalem. +#define NEHALEM_OPTIMIZATION +#endif + #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION diff --git a/common_x86_64.h b/common_x86_64.h index 188903848..8e9d79443 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -218,6 +218,12 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER +#if defined(SANDYBRIDGE) || defined(HASWELL) +//Enable some optimization for nehalem. +#define NEHALEM_OPTIMIZATION +#endif + + #if defined(PILEDRIVER) || defined(BULLDOZER) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION diff --git a/cpuid.h b/cpuid.h index 2cbbd4539..cb4404cb0 100644 --- a/cpuid.h +++ b/cpuid.h @@ -107,7 +107,7 @@ #define CORE_BOBCAT 21 #define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL CORE_SANDYBRIDGE +#define CORE_HASWELL 24 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,7 +200,6 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -// this define is because BLAS doesn't have haswell specific optimizations yet -#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE +#define CPUTYPE_HASWELL 48 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 9e850a2aa..4ed01b891 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1243,6 +1243,7 @@ static char *cpuname[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *lowercpuname[] = { @@ -1293,6 +1294,7 @@ static char *lowercpuname[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; static char *corename[] = { @@ -1320,6 +1322,7 @@ static char *corename[] = { "BOBCAT", "BULLDOZER", "PILEDRIVER", + "HASWELL", }; static char *corename_lower[] = { @@ -1347,6 +1350,7 @@ static char *corename_lower[] = { "bobcat", "bulldozer", "piledriver", + "haswell", }; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 197cc2b2d..bc6c386ec 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -65,14 +65,15 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_HASWELL; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA #endif -//Use sandy bridge kernels for haswell. 
-#define gotoblas_HASWELL gotoblas_SANDYBRIDGE + #define VENDOR_INTEL 1 #define VENDOR_AMD 2 @@ -285,6 +286,7 @@ static char *corename[] = { "Bobcat", "Bulldozer", "Piledriver", + "Haswell", }; char *gotoblas_corename(void) { @@ -307,7 +309,8 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; - if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; + if (gotoblas == &gotoblas_HASWELL) return corename[20]; return corename[0]; } diff --git a/getarch.c b/getarch.c index 3ffda6244..274d2a868 100644 --- a/getarch.c +++ b/getarch.c @@ -298,6 +298,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "SANDYBRIDGE" #endif +#ifdef FORCE_HASWELL +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "HASWELL" +#define ARCHCONFIG "-DHASWELL " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX " \ + "-DFMA3" +#define LIBNAME "haswell" +#define CORENAME "HASWELL" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/x86/KERNEL.HASWELL b/kernel/x86/KERNEL.HASWELL new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.HASWELL @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index ebd1377f1..b620f6d12 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || 
defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index 6fa7d410e..3823aa08b 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 9ce4cd8d4..e60444b08 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index a1a35a7a5..f19047d14 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index a5333640d..4a87539bf 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 
c3619ec3d..79222a07a 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index 53e53c3ce..0a646e980 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index 3c056cdff..58d8b4d6d 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 1efa1fd25..1604da17f 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 849afed73..fce813eb4 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || 
defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index c1833abe2..fda1a9860 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL new file mode 100644 index 000000000..c321be752 --- /dev/null +++ b/kernel/x86_64/KERNEL.HASWELL @@ -0,0 +1,84 @@ +SGEMMKERNEL = sgemm_kernel_8x8_sandy.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x8_sandy.S +DGEMMINCOPY = ../generic/gemm_ncopy_8.c +DGEMMITCOPY = ../generic/gemm_tcopy_8.c +#DGEMMONCOPY = gemm_ncopy_4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +#DGEMMOTCOPY = gemm_tcopy_4.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +#CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMKERNEL = cgemm_kernel_4x8_sandy.S +CGEMMINCOPY = ../generic/zgemm_ncopy_8_sandy.c +CGEMMITCOPY = ../generic/zgemm_tcopy_8_sandy.c +CGEMMONCOPY = ../generic/zgemm_ncopy_4_sandy.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4_sandy.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +#ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S 
+ZGEMMKERNEL = zgemm_kernel_4x4_sandy.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +#STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +#STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +#STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +#DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +#DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +#DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +#CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +#CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +#CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +#ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +#ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 
f56490ee3..aad66b4ac 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bb40ac41e..1b28bcd20 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 653a96ff8..12aab61ed 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index e26088c16..d044dfbe2 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 7f3b54ddb..f818170a3 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE 
(16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 852a81820..7b6c5976d 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index c79e7f1a2..6dabf0735 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 6d0afce18..df64d8045 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#if defined(NEHALEM) || defined(SANDYBRIDGE) +#if defined(NEHALEM) || defined(NEHALEM_OPTIMIZATION) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/param.h b/param.h index 0c3df6951..e4b3871b1 100644 --- a/param.h +++ b/param.h @@ -1150,6 +1150,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif +#ifdef HASWELL + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 8 +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 512 +#define SGEMM_DEFAULT_R sgemm_r +//#define SGEMM_DEFAULT_R 1024 + +#define DGEMM_DEFAULT_P 512 +#define DGEMM_DEFAULT_R dgemm_r +//#define DGEMM_DEFAULT_R 1024 + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 128 +//#define CGEMM_DEFAULT_R cgemm_r +#define CGEMM_DEFAULT_R 1024 + +#define ZGEMM_DEFAULT_P 512 +#define ZGEMM_DEFAULT_R zgemm_r +//#define ZGEMM_DEFAULT_R 1024 + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 256 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 256 +#define ZGEMM_DEFAULT_Q 192 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + #ifdef ATOM