From 19a48b82cf3c4aa25659ea89dce494e2d78fed25 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 30 Mar 2012 20:01:03 +0800 Subject: [PATCH] Init Sandybridge codes based on Nehalem. --- Makefile.system | 5 +- TargetList.txt | 2 + cpuid.h | 3 + cpuid_x86.c | 11 +++- driver/others/parameter.c | 13 ++++- getarch.c | 14 +++++ kernel/setparam-ref.c | 16 ++++++ kernel/x86/KERNEL.SANDYBRIDGE | 1 + kernel/x86/gemm_kernel_2x4_penryn.S | 6 ++ kernel/x86/gemm_kernel_4x4_penryn.S | 8 ++- kernel/x86/gemv_n_sse.S | 2 +- kernel/x86/gemv_n_sse2.S | 2 +- kernel/x86/gemv_t_sse.S | 2 +- kernel/x86/gemv_t_sse2.S | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/zgemm_kernel_1x2_penryn.S | 2 +- kernel/x86/zgemm_kernel_2x2_penryn.S | 2 +- kernel/x86/zgemv_n_sse.S | 2 +- kernel/x86/zgemv_n_sse2.S | 2 +- kernel/x86/zgemv_t_sse.S | 2 +- kernel/x86/zgemv_t_sse2.S | 2 +- kernel/x86/zscal_sse.S | 4 +- kernel/x86/zscal_sse2.S | 4 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 59 ++++++++++++++++++++ kernel/x86_64/dgemm_ncopy_2.S | 6 ++ kernel/x86_64/dgemm_ncopy_4.S | 2 +- kernel/x86_64/dgemm_ncopy_8.S | 6 ++ kernel/x86_64/dgemm_tcopy_2.S | 7 +++ kernel/x86_64/dgemm_tcopy_4.S | 6 ++ kernel/x86_64/dgemm_tcopy_8.S | 7 +++ kernel/x86_64/gemm_ncopy_2.S | 7 +++ kernel/x86_64/gemm_ncopy_4.S | 2 +- kernel/x86_64/gemm_tcopy_2.S | 7 +++ kernel/x86_64/gemm_tcopy_4.S | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zscal_sse.S | 2 +- kernel/x86_64/zscal_sse2.S | 4 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 7 +++ l2param.h | 11 ++++ param.h | 74 +++++++++++++++++++++++++ 57 files changed, 309 insertions(+), 45 deletions(-) create mode 100644 kernel/x86/KERNEL.SANDYBRIDGE create mode 100644 kernel/x86_64/KERNEL.SANDYBRIDGE diff --git a/Makefile.system b/Makefile.system index 0fd223d60..7c6dce4a5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -226,11 +226,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifndef DYNAMIC_CORE @@ -740,6 +740,7 @@ export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 +export HAVE_AVX export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/TargetList.txt b/TargetList.txt index 1c3d7c5b9..9e0db4866 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -18,6 +18,7 @@ CORE2 PENRYN DUNNINGTON NEHALEM +SANDYBRIDGE ATOM b)AMD CPU: @@ -47,6 +48,7 @@ CELL 3.MIPS64 CPU: SICORTEX LOONGSON3A +LOONGSON3B 4.IA64 CPU: ITANIUM2 diff --git a/cpuid.h b/cpuid.h index 665ede077..c0f21698d 100644 --- a/cpuid.h +++ b/cpuid.h @@ -103,6 +103,7 @@ #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 +#define CORE_SANDYBRIDGE 20 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -122,6 +123,7 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) +#define HAVE_AVX (1 << 18) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -188,4 +190,5 @@ typedef struct { #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 +#define CPUTYPE_SANDYBRIDGE 44 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index e183e9fc3..9916a662b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -189,6 +189,7 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); @@ -983,7 +984,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; @@ -1140,6 +1141,7 @@ static char *cpuname[] = { "NSGEODE", "VIAC3", "NANO", + "SANDYBRIDGE", }; static char *lowercpuname[] = { @@ -1186,6 +1188,7 @@ static char *lowercpuname[] = { "tms3x00", "nsgeode", "nano", + "sandybridge", }; static char *corename[] = { @@ -1209,6 +1212,7 @@ static char *corename[] = { "NEHALEM", "ATOM", "NANO", + "SANDYBRIDGE", }; static char *corename_lower[] = { @@ -1232,6 +1236,7 @@ static char *corename_lower[] = { "nehalem", "atom", "nano", + "sandybridge", }; @@ -1315,7 +1320,7 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; @@ -1414,6 +1419,7 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); @@ -1479,6 +1485,7 @@ void get_sse(void){ if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 21f56e889..5ff1f2934 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -165,7 +165,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -384,6 +384,17 @@ void blas_set_parameter(void){ #endif #endif +#if defined(SANDYBRIDGE) + sgemm_p = 1024; + dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; diff --git a/getarch.c b/getarch.c index 5b614472a..d8f467f03 100644 --- a/getarch.c +++ b/getarch.c @@ -278,6 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "NEHALEM" #endif +#ifdef FORCE_SANDYBRIDGE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index aa45d47f8..e841bb171 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -746,6 +746,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SANDYBRIDGE + +#ifdef DEBUG + fprintf(stderr, "Sandybridge\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON #ifdef DEBUG diff --git a/kernel/x86/KERNEL.SANDYBRIDGE b/kernel/x86/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.SANDYBRIDGE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S index 263aea042..0bdc9185c 100644 --- a/kernel/x86/gemm_kernel_2x4_penryn.S +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -76,6 +76,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S index 6775d1d18..2d51d9711 100644 --- a/kernel/x86/gemm_kernel_4x4_penryn.S +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -69,6 +69,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif @@ -262,7 +268,7 @@ movaps -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 -#ifndef NEHALEM +#if !(defined(NEHALEM) || defined(SANDYBRIDGE)) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) #endif pshufd $0x93, %xmm1, %xmm2 diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index aae49a22d..0891657fa 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 669c5ac6c..5f5fa5a51 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index a4990116d..5bacb7da8 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 9960b5c0c..c7e685dd8 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 6645b790e..ebd1377f1 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index bb33918ef..6fa7d410e 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 55c69e49f..9ce4cd8d4 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 11cc104e2..a1a35a7a5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 01876a515..a5333640d 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 40a9604d3..c3619ec3d 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S index 849361956..70b38dc79 100644 --- a/kernel/x86/zgemm_kernel_1x2_penryn.S +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S index edd89b112..715eb4d4f 100644 --- a/kernel/x86/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 340b9d375..8e28bb8e6 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 441fbb0c0..607c51de0 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index 4312ed173..fb98226ee 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 78ca14cab..e2f391a82 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index 849d787f6..53abb697b 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -55,7 +55,7 @@ #define XX %edi #define FLAG %ebp -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -697,7 +697,7 @@ cmpl $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index 5b1da61e6..26ef693a0 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -57,7 +57,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -860,7 +860,7 @@ cmpl $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e5949aa6e..53e53c3ce 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index b01498f78..3c056cdff 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 3668ee2bb..1efa1fd25 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 13064166f..849afed73 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index ebff425c0..c1833abe2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..58a883243 --- /dev/null +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = dgemm_ncopy_2.S +DGEMMITCOPY = dgemm_tcopy_2.S +DGEMMONCOPY = dgemm_ncopy_8.S +DGEMMOTCOPY = dgemm_tcopy_8.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S index 2724cfe92..e4bde49bd 100644 --- a/kernel/x86_64/dgemm_ncopy_2.S +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S index 52115bd4d..1e4431664 100644 --- a/kernel/x86_64/dgemm_ncopy_4.S +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -45,7 +45,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S index 5d3627230..f35c3c5af 100644 --- a/kernel/x86_64/dgemm_ncopy_8.S +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S index 06e59991d..b0b3590aa 100644 --- a/kernel/x86_64/dgemm_tcopy_2.S +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -52,6 +52,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S index 8b81c41c0..85b0253d7 100644 --- a/kernel/x86_64/dgemm_tcopy_4.S +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -51,6 +51,12 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S index 976033714..3d411cda5 100644 --- a/kernel/x86_64/dgemm_tcopy_8.S +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -46,6 +46,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S index 72c2b9d20..06a0feae9 100644 --- a/kernel/x86_64/gemm_ncopy_2.S +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index a04542f6a..cac647fa0 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S index 8bfaca265..190cebb29 100644 --- a/kernel/x86_64/gemm_tcopy_2.S +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index 877969ff5..c2308162f 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 901a5ad31..9db45a642 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bfe7ebd69..ca03f86b7 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 2df76f1cb..01ad2d96e 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index bbba0b427..60c1ea778 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index eb2092dc7..393988e73 100644 --- a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -685,7 +685,7 @@ cmpq $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 23d2da73d..a553bbd39 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -55,7 +55,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -803,7 +803,7 @@ cmpq $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 39f0ff46f..fc54dc4a5 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 711907711..eae31b955 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 175912c71..4d6ad3326 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 3e4b17030..2623bfe6d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/l1param.h b/l1param.h index f1d223ea7..61c61aa94 100644 --- a/l1param.h +++ b/l1param.h @@ -9,6 +9,13 @@ #define ALIGNED_ACCESS #endif +#ifdef SANDYBRIDGE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/l2param.h b/l2param.h index af9d17179..a371b2ded 100644 --- a/l2param.h +++ b/l2param.h @@ -63,6 +63,17 @@ #define PREFETCHSIZE 64 * 3 #endif +#ifdef SANDYBRIDGE +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/param.h b/param.h index 72d721d4e..53159a4fd 100644 --- a/param.h +++ b/param.h @@ -913,6 +913,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef SANDYBRIDGE + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 32 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 504 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P 504 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 252 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 252 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + + #ifdef ATOM #define SNUMOPT 2