From 4319769b79c24cc5ca5559a53b37241d4770c322 Mon Sep 17 00:00:00 2001 From: Werner Saar Date: Sun, 28 Dec 2014 20:16:46 +0800 Subject: [PATCH] added target processor STEAMROLLER --- Makefile.rule | 2 +- Makefile.system | 8 ++- README.md | 1 + TargetList.txt | 1 + common_x86.h | 2 +- common_x86_64.h | 2 +- cpuid.h | 10 ++-- cpuid_x86.c | 18 +++++++ driver/others/dynamic.c | 15 ++++++ driver/others/parameter.c | 4 +- getarch.c | 17 ++++++ kernel/setparam-ref.c | 17 ++++++ kernel/x86_64/KERNEL.STEAMROLLER | 76 ++++++++++++++++++++++++++ kernel/x86_64/ddot.c | 2 +- kernel/x86_64/sdot.c | 2 +- kernel/x86_64/sgemv_n_4.c | 2 +- kernel/x86_64/sgemv_t_4.c | 2 +- kernel/x86_64/zgemv_t_4.c | 2 +- param.h | 93 ++++++++++++++++++++++++++++++++ 19 files changed, 261 insertions(+), 15 deletions(-) create mode 100644 kernel/x86_64/KERNEL.STEAMROLLER diff --git a/Makefile.rule b/Makefile.rule index d3a2d1fa3..4bd1ab110 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -15,7 +15,7 @@ VERSION = 0.2.13 # TARGET = PENRYN # If you want to support multiple architecture in one binary -# DYNAMIC_ARCH = 1 +DYNAMIC_ARCH = 1 # C compiler including binary type(32bit / 64bit). Default is gcc. # Don't use Intel Compiler or PGI, it won't generate right codes as I expect. 
diff --git a/Makefile.system b/Makefile.system index ec6339d62..e3e2d5204 100644 --- a/Makefile.system +++ b/Makefile.system @@ -61,6 +61,9 @@ endif ifeq ($(TARGET), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -85,6 +88,9 @@ endif ifeq ($(TARGET_CORE), PILEDRIVER) GETARCH_FLAGS := -DFORCE_BARCELONA endif +ifeq ($(TARGET_CORE), STEAMROLLER) +GETARCH_FLAGS := -DFORCE_BARCELONA +endif endif @@ -392,7 +398,7 @@ endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER STEAMROLLER endif ifneq ($(NO_AVX2), 1) DYNAMIC_CORE += HASWELL diff --git a/README.md b/README.md index f4c547701..cdacf9888 100644 --- a/README.md +++ b/README.md @@ -60,6 +60,7 @@ Please read GotoBLAS_01Readme.txt - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. - **AMD Bulldozer**: x86-64 ?GEMM FMA4 kernels. (Thank Werner Saar) - **AMD PILEDRIVER**: Uses Bulldozer codes with some optimizations. +- **AMD STEAMROLLER**: Uses Bulldozer codes with some optimizations. #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2. diff --git a/TargetList.txt b/TargetList.txt index 97661fdcf..c91401f01 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -32,6 +32,7 @@ ISTANBUL BOBCAT BULLDOZER PILEDRIVER +STEAMROLLER c)VIA CPU: SSE_GENERIC diff --git a/common_x86.h b/common_x86.h index f97fd348a..9d82090cc 100644 --- a/common_x86.h +++ b/common_x86.h @@ -171,7 +171,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define MMXSTORE movd #endif -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. 
#define BARCELONA_OPTIMIZATION #endif diff --git a/common_x86_64.h b/common_x86_64.h index 547614f74..e0a6c4c42 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -226,7 +226,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #ifdef ASSEMBLER -#if defined(PILEDRIVER) || defined(BULLDOZER) +#if defined(PILEDRIVER) || defined(BULLDOZER) || defined(STEAMROLLER) //Enable some optimazation for barcelona. #define BARCELONA_OPTIMIZATION #endif diff --git a/cpuid.h b/cpuid.h index cb4404cb0..ab6a3fb32 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,10 +104,11 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 -#define CORE_BOBCAT 21 -#define CORE_BULLDOZER 22 +#define CORE_BOBCAT 21 +#define CORE_BULLDOZER 22 #define CORE_PILEDRIVER 23 -#define CORE_HASWELL 24 +#define CORE_HASWELL 24 +#define CORE_STEAMROLLER 25 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -200,6 +201,7 @@ typedef struct { #define CPUTYPE_BOBCAT 45 #define CPUTYPE_BULLDOZER 46 #define CPUTYPE_PILEDRIVER 47 -#define CPUTYPE_HASWELL 48 +#define CPUTYPE_HASWELL 48 +#define CPUTYPE_STEAMROLLER 49 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 44446e582..ef90b26d8 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1162,6 +1162,12 @@ int get_cpuname(void){ return CPUTYPE_PILEDRIVER; else return CPUTYPE_BARCELONA; //OS don't support AVX. + case 0: + if(support_avx()) + return CPUTYPE_STEAMROLLER; + else + return CPUTYPE_BARCELONA; //OS doesn't support AVX. 
+ } break; case 5: @@ -1290,6 +1296,7 @@ static char *cpuname[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *lowercpuname[] = { @@ -1341,6 +1348,7 @@ static char *lowercpuname[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; static char *corename[] = { @@ -1369,6 +1377,7 @@ static char *corename[] = { "BULLDOZER", "PILEDRIVER", "HASWELL", + "STEAMROLLER", }; static char *corename_lower[] = { @@ -1397,6 +1406,7 @@ static char *corename_lower[] = { "bulldozer", "piledriver", "haswell", + "steamroller", }; @@ -1562,7 +1572,15 @@ int get_coretype(void){ return CORE_PILEDRIVER; else return CORE_BARCELONA; //OS don't support AVX. + + case 0: + if(support_avx()) + return CORE_STEAMROLLER; + else + return CORE_BARCELONA; //OS doesn't support AVX. } + + }else return CORE_BARCELONA; } } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 1235df2db..6fd1d8cdf 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -66,6 +66,7 @@ extern gotoblas_t gotoblas_BOBCAT; extern gotoblas_t gotoblas_SANDYBRIDGE; extern gotoblas_t gotoblas_BULLDOZER; extern gotoblas_t gotoblas_PILEDRIVER; +extern gotoblas_t gotoblas_STEAMROLLER; #ifdef NO_AVX2 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE #else @@ -77,6 +78,7 @@ extern gotoblas_t gotoblas_HASWELL; #define gotoblas_HASWELL gotoblas_NEHALEM #define gotoblas_BULLDOZER gotoblas_BARCELONA #define gotoblas_PILEDRIVER gotoblas_BARCELONA +#define gotoblas_STEAMROLLER gotoblas_BARCELONA #endif @@ -275,7 +277,17 @@ static gotoblas_t *get_coretype(void){ openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } + }else if(model == 0){ + //AMD STEAMROLLER + if(support_avx()) + return &gotoblas_STEAMROLLER; + else{ + openblas_warning(FALLBACK_VERBOSE, BARCELONA_FALLBACK); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. 
+ } } + + } else { return &gotoblas_BARCELONA; } @@ -315,6 +327,7 @@ static char *corename[] = { "Bulldozer", "Piledriver", "Haswell", + "Steamroller", }; char *gotoblas_corename(void) { @@ -339,6 +352,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; if (gotoblas == &gotoblas_PILEDRIVER) return corename[19]; if (gotoblas == &gotoblas_HASWELL) return corename[20]; + if (gotoblas == &gotoblas_STEAMROLLER) return corename[21]; return corename[0]; } @@ -370,6 +384,7 @@ static gotoblas_t *force_coretype(char *coretype){ switch (found) { + case 21: return (&gotoblas_STEAMROLLER); case 20: return (&gotoblas_HASWELL); case 19: return (&gotoblas_PILEDRIVER); case 18: return (&gotoblas_BULLDOZER); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index f0f889a15..d741f2fb9 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -166,7 +166,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) || \ - defined(PILEDRIVER) || defined(HASWELL) + defined(PILEDRIVER) || defined(HASWELL) || defined(STEAMROLLER) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -251,7 +251,7 @@ void blas_set_parameter(void){ env_var_t p; int factor; -#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) || defined(STEAMROLLER) int size = 16; #else int size = get_L2_size(); diff --git a/getarch.c b/getarch.c index 81ab9e37c..f6a5ecb94 100644 --- a/getarch.c +++ b/getarch.c @@ -432,6 +432,23 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "PILEDRIVER" #endif +#if defined (FORCE_STEAMROLLER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "STEAMROLLER" +#define ARCHCONFIG "-DSTEAMROLLER " \ + "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \ + "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3" +#define LIBNAME "steamroller" +#define CORENAME "STEAMROLLER" +#endif + + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 0d7bbd4ac..1fa7f7984 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -941,6 +941,23 @@ static void init_parameter(void) { #endif #endif +#ifdef STEAMROLLER + +#ifdef DEBUG + fprintf(stderr, "Steamroller\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86_64/KERNEL.STEAMROLLER b/kernel/x86_64/KERNEL.STEAMROLLER new file mode 100644 index 000000000..55285e3d3 --- /dev/null +++ b/kernel/x86_64/KERNEL.STEAMROLLER @@ -0,0 +1,76 @@ +SGEMVNKERNEL = sgemv_n_4.c +SGEMVTKERNEL = sgemv_t_4.c + +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_4.c + +DGEMVNKERNEL = dgemv_n_bulldozer.S +DGEMVTKERNEL = dgemv_t_bulldozer.S + +DDOTKERNEL = ddot_bulldozer.S +DCOPYKERNEL = dcopy_bulldozer.S + +SGEMMKERNEL = sgemm_kernel_16x2_piledriver.S +SGEMMINCOPY = ../generic/gemm_ncopy_16.c +SGEMMITCOPY = ../generic/gemm_tcopy_16.c +SGEMMONCOPY = gemm_ncopy_2_bulldozer.S +SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +SGEMMINCOPYOBJ 
= sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) + +DGEMMKERNEL = dgemm_kernel_8x2_piledriver.S +DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S +DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S +DGEMMONCOPY = gemm_ncopy_2_bulldozer.S +DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMMKERNEL = cgemm_kernel_4x2_piledriver.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_piledriver.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S + +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S +DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + 
+ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c + + diff --git a/kernel/x86_64/ddot.c b/kernel/x86_64/ddot.c index b3aad438f..d501c2f68 100644 --- a/kernel/x86_64/ddot.c +++ b/kernel/x86_64/ddot.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "ddot_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "ddot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sdot.c b/kernel/x86_64/sdot.c index 632d16810..6fec48175 100644 --- a/kernel/x86_64/sdot.c +++ b/kernel/x86_64/sdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sdot_microk_bulldozer-2.c" #elif defined(NEHALEM) #include "sdot_microk_nehalem-2.c" diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c index a840f8ba9..930dd26b2 100644 --- a/kernel/x86_64/sgemv_n_4.c +++ b/kernel/x86_64/sgemv_n_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sgemv_n_microk_bulldozer-4.c" #elif defined(NEHALEM) #include "sgemv_n_microk_nehalem-4.c" diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c index cd13bb67d..2bb5809ea 100644 --- a/kernel/x86_64/sgemv_t_4.c +++ b/kernel/x86_64/sgemv_t_4.c @@ -30,7 +30,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(NEHALEM) #include "sgemv_t_microk_nehalem-4.c" -#elif defined(BULLDOZER) || defined(PILEDRIVER) +#elif defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "sgemv_t_microk_bulldozer-4.c" #elif defined(SANDYBRIDGE) #include "sgemv_t_microk_sandy-4.c" diff --git a/kernel/x86_64/zgemv_t_4.c b/kernel/x86_64/zgemv_t_4.c index 84cf4e2e8..4abb2d5ad 100644 --- a/kernel/x86_64/zgemv_t_4.c +++ b/kernel/x86_64/zgemv_t_4.c @@ -29,7 +29,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" -#if defined(BULLDOZER) || defined(PILEDRIVER) +#if defined(BULLDOZER) || defined(PILEDRIVER) || defined(STEAMROLLER) #include "zgemv_t_microk_bulldozer-4.c" #elif defined(HASWELL) #include "zgemv_t_microk_haswell-4.c" diff --git a/param.h b/param.h index bce05c957..e3e535b14 100644 --- a/param.h +++ b/param.h @@ -406,6 +406,99 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef STEAMROLLER +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + + + +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_N 2 +#define SGEMM_DEFAULT_UNROLL_M 16 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#define CGEMM3M_DEFAULT_UNROLL_N 4 +#define CGEMM3M_DEFAULT_UNROLL_M 8 +#define ZGEMM3M_DEFAULT_UNROLL_N 4 +#define ZGEMM3M_DEFAULT_UNROLL_M 4 
+#define GEMV_UNROLL 8 +#endif + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_P 768 +#define DGEMM_DEFAULT_P 768 +#define ZGEMM_DEFAULT_P 384 +#define CGEMM_DEFAULT_P 768 +#else +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 480 +#define ZGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#endif +#define QGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#if defined(ARCH_X86_64) +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 168 +#define ZGEMM_DEFAULT_Q 168 +#define CGEMM_DEFAULT_Q 168 +#else +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#endif +#define QGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + +#define CGEMM3M_DEFAULT_P 448 +#define ZGEMM3M_DEFAULT_P 224 +#define XGEMM3M_DEFAULT_P 112 +#define CGEMM3M_DEFAULT_Q 224 +#define ZGEMM3M_DEFAULT_Q 224 +#define XGEMM3M_DEFAULT_Q 224 +#define CGEMM3M_DEFAULT_R 12288 +#define ZGEMM3M_DEFAULT_R 12288 +#define XGEMM3M_DEFAULT_R 12288 + +#define SGEMM_DEFAULT_R 12288 +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R 12288 +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + + #ifdef ATHLON #define SNUMOPT 4