Refs #113. Support AMD Bobcat (target name BOBCATE) by reusing the Barcelona kernel code. Replace 3DNow! with MMX.

This commit is contained in:
Zhang Xianyi 2012-05-31 18:17:45 +08:00
parent 37edae1c90
commit d6cab3f37e
29 changed files with 303 additions and 70 deletions

View File

@ -247,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif
ifndef DYNAMIC_CORE

View File

@ -28,6 +28,7 @@ OPTERON_SSE3
BARCELONA
SHANGHAI
ISTANBUL
BOBCATE
c)VIA CPU:
SSE_GENERIC

View File

@ -104,6 +104,7 @@
#define CORE_ATOM 18
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCATE 21
#define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1)
@ -191,4 +192,5 @@ typedef struct {
#define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCATE 45
#endif

View File

@ -1028,6 +1028,8 @@ int get_cpuname(void){
case 1:
case 10:
return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCATE;
}
break;
}
@ -1148,6 +1150,7 @@ static char *cpuname[] = {
"VIAC3",
"NANO",
"SANDYBRIDGE",
"BOBCATE",
};
static char *lowercpuname[] = {
@ -1195,6 +1198,7 @@ static char *lowercpuname[] = {
"nsgeode",
"nano",
"sandybridge",
"bobcate",
};
static char *corename[] = {
@ -1219,6 +1223,7 @@ static char *corename[] = {
"ATOM",
"NANO",
"SANDYBRIDGE",
"BOBCATE",
};
static char *corename_lower[] = {
@ -1243,6 +1248,7 @@ static char *corename_lower[] = {
"atom",
"nano",
"sandybridge",
"bobcate",
};
@ -1351,7 +1357,9 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA;
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCATE;
else return CORE_BARCELONA;
}
}

View File

@ -163,7 +163,7 @@ int get_L2_size(void){
int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
@ -446,7 +446,7 @@ void blas_set_parameter(void){
#endif
#endif
#if defined(CORE_BARCELONA)
#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE)
size >>= 8;
sgemm_p = 232 * size;

View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCATE */
/* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */
/* #define FORCE_NANO */
@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA"
#endif
/* Forced-target configuration for BOBCATE.
 * Active only when FORCE_BOBCATE is defined (see the commented-out
 * FORCE_* switches near the top of this file). It sets the FORCE marker
 * and provides the architecture strings, the ARCHCONFIG flag string and
 * the library/core names for this target. */
#if defined(FORCE_BOBCATE)
#define FORCE
/* NOTE(review): FORCE_INTEL is set although the commit describes this as an
 * AMD target; presumably it mirrors the neighbouring forced-target blocks --
 * confirm against how FORCE_INTEL is consumed. */
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCATE"
/* ARCHCONFIG is one string literal built from adjacent pieces: it passes
 * -DBOBCATE, the L1 data / L2 cache sizes and line sizes, the DTB parameters
 * and the HAVE_* feature macros listed below. */
#define ARCHCONFIG "-DBOBCATE " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
/* Lower-case library name and upper-case core name for this target. */
#define LIBNAME "bobcate"
#define CORENAME "BOBCATE"
#endif
#ifdef FORCE_SSE_GENERIC
#define FORCE
#define FORCE_INTEL

View File

@ -794,6 +794,22 @@ static void init_parameter(void) {
#endif
#endif
/* BOBCATE section of init_parameter(): copies the compile-time
 * *GEMM_DEFAULT_P values into the corresponding *gemm_p fields of
 * TABLE_NAME. */
#ifdef BOBCATE
#ifdef DEBUG
/* Debug builds print a marker to stderr showing that this section ran. */
fprintf(stderr, "Bobcate\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
/* The q/x fields are only assigned when EXPRECISION is defined. */
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO
#ifdef DEBUG

59
kernel/x86/KERNEL.BOBCATE Normal file
View File

@ -0,0 +1,59 @@
# Kernel source selection for the BOBCATE target (kernel/x86).
# No BOBCATE-specific sources are listed: the GEMM and GEMM3M kernels point
# at the *_barcelona.S files, the TRSM kernels at the *_sse.S / *_sse2.S
# files, and all copy routines at the ../generic C implementations.

# SGEMM: 4x4 kernel; only the "O" copy routines are set, the "I" ones are empty.
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 2x4 kernel; "I" copies use the *_2 routines, "O" copies the *_4 routines.
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 2x2 kernel; only the "O" copy routines are set.
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 1x2 kernel; "I" copies use the *_1 routines, "O" copies the *_2 routines.
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels: in every precision the RN variant is built from the LT source.
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
# NOTE(review): unlike the S/D/C groups above, ZTRSMKERNEL_LN also points at
# the LT source rather than an LN one -- confirm this is intentional.
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
# GEMM3M kernels.
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -439,7 +439,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2429,7 +2429,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -910,7 +910,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2562,7 +2562,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4)
#endif
@ -1036,7 +1036,7 @@
.L42:
mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22:
mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif
mulsd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp)
#define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52:
mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4
.L62:
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
@ -2031,7 +2031,7 @@
.L72:
mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22:
mulps %xmm0, %xmm2
addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32:
mulss %xmm0, %xmm2
addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif
movss 4 * SIZE(BB), %xmm2

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096
#define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
#define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112
#define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA)
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif
addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
# Kernel source selection for a second BOBCATE kernel list.
# NOTE(review): the path of this file is not shown in the listing; the wider
# 8x4/4x4 kernels suggest the x86_64 kernel directory -- confirm.
# The GEMM, GEMM3M and DTRSM kernels point at *_barcelona.S sources, the
# remaining TRSM kernels at *_sse.S / *_sse2.S sources, and the copy routines
# at a mix of ../generic C files and assembly (*_opteron.S, zgemm_*copy_2.S).

# ZGEMV kernels.
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
# SGEMM: 8x4 kernel; "I" copies are the generic *_8 routines, "O" copies the
# *_4_opteron assembly routines.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM: 4x4 kernel; only the "O" copy routines are set.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM: 4x2 kernel; "I" copies are the generic *_4 routines, "O" copies the
# zgemm_*copy_2 assembly routines.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM: 2x2 kernel; only the "O" copy routines are set.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# TRSM kernels: in every precision the RN variant is built from the LT source.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
# GEMM3M kernels.
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -160,7 +160,7 @@
#define a3 %xmm14
#define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI)
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c
#else

View File

@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16)

View File

@ -67,6 +67,13 @@
#define ALIGNED_ACCESS
#endif
/* Prefetch configuration used when BOBCATE is defined: PREFETCH and
 * PREFETCHW expand to the prefetch / prefetchw mnemonics, PREFETCHSIZE is
 * 128 * 5, and ALIGNED_ACCESS is defined.
 * NOTE(review): the unit of PREFETCHSIZE (elements vs. bytes) depends on how
 * the kernel scales it -- confirm at the use sites. */
#ifdef BOBCATE
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (128 * 5)
#define ALIGNED_ACCESS
#endif
#ifdef NANO
#define PREFETCH prefetcht0
#define PREFETCHW prefetcht0

View File

@ -85,7 +85,7 @@
#define movsd movlps
#endif
#if defined(BARCELONA) || defined(SHANGHAI)
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE)
#define ALIGNED_ACCESS
#define MOVUPS_A movaps
#define MOVUPS_XL movaps

64
param.h
View File

@ -1,5 +1,5 @@
/*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved.
Redistribution and use in source and binary forms, with or without
@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r
#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE
#define GEMM_THREAD gemm_thread_mn
#endif
#if defined(BOBCATE)
#define SNUMOPT 8
#define DNUMOPT 4
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_M 4
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define XGEMM_DEFAULT_UNROLL_M 1
#endif
#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 224
#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56
#define SGEMM_DEFAULT_Q 224
#define DGEMM_DEFAULT_Q 224
#define QGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224
#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r