Merge branch 'develop' into sandybridge

This commit is contained in:
Zhang Xianyi 2012-06-16 22:07:17 +08:00
commit 11b4a0e4b6
38 changed files with 489 additions and 148 deletions

View File

@ -247,11 +247,11 @@ endif
ifdef DYNAMIC_ARCH ifdef DYNAMIC_ARCH
ifeq ($(ARCH), x86) ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif endif
ifeq ($(ARCH), x86_64) ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO
endif endif
ifndef DYNAMIC_CORE ifndef DYNAMIC_CORE

View File

@ -28,6 +28,7 @@ OPTERON_SSE3
BARCELONA BARCELONA
SHANGHAI SHANGHAI
ISTANBUL ISTANBUL
BOBCAT
c)VIA CPU: c)VIA CPU:
SSE_GENERIC SSE_GENERIC

View File

@ -356,4 +356,11 @@ REALNAME:
#ifndef ALIGN_6 #ifndef ALIGN_6
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -448,4 +448,10 @@ REALNAME:
#define ALIGN_6 .align 64 #define ALIGN_6 .align 64
#endif #endif
// ffreep %st(0).
// Because Clang didn't support ffreep, we directly use the opcode.
// Please check out http://www.sandpile.org/x86/opc_fpu.htm
#ifndef ffreep
#define ffreep .byte 0xdf, 0xc0 #
#endif
#endif #endif

View File

@ -104,6 +104,7 @@
#define CORE_ATOM 18 #define CORE_ATOM 18
#define CORE_NANO 19 #define CORE_NANO 19
#define CORE_SANDYBRIDGE 20 #define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define HAVE_SSE (1 << 0) #define HAVE_SSE (1 << 0)
#define HAVE_SSE2 (1 << 1) #define HAVE_SSE2 (1 << 1)
@ -191,4 +192,5 @@ typedef struct {
#define CPUTYPE_VIAC3 42 #define CPUTYPE_VIAC3 42
#define CPUTYPE_NANO 43 #define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44 #define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#endif #endif

View File

@ -1028,6 +1028,8 @@ int get_cpuname(void){
case 1: case 1:
case 10: case 10:
return CPUTYPE_BARCELONA; return CPUTYPE_BARCELONA;
case 5:
return CPUTYPE_BOBCAT;
} }
break; break;
} }
@ -1148,6 +1150,7 @@ static char *cpuname[] = {
"VIAC3", "VIAC3",
"NANO", "NANO",
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCAT",
}; };
static char *lowercpuname[] = { static char *lowercpuname[] = {
@ -1195,6 +1198,7 @@ static char *lowercpuname[] = {
"nsgeode", "nsgeode",
"nano", "nano",
"sandybridge", "sandybridge",
"bobcat",
}; };
static char *corename[] = { static char *corename[] = {
@ -1219,6 +1223,7 @@ static char *corename[] = {
"ATOM", "ATOM",
"NANO", "NANO",
"SANDYBRIDGE", "SANDYBRIDGE",
"BOBCAT",
}; };
static char *corename_lower[] = { static char *corename_lower[] = {
@ -1243,6 +1248,7 @@ static char *corename_lower[] = {
"atom", "atom",
"nano", "nano",
"sandybridge", "sandybridge",
"bobcat",
}; };
@ -1351,7 +1357,9 @@ int get_coretype(void){
if (family <= 0x5) return CORE_80486; if (family <= 0x5) return CORE_80486;
if (family <= 0xe) return CORE_ATHLON; if (family <= 0xe) return CORE_ATHLON;
if (family == 0xf){ if (family == 0xf){
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
else if (exfamily == 5) return CORE_BOBCAT;
else return CORE_BARCELONA;
} }
} }

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define MAX_NODES 16 #define MAX_NODES 16
#define MAX_CPUS 256 #define MAX_CPUS 256
#define NCPUBITS (8*sizeof(unsigned long))
#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS)
#define CPUELT(cpu) ((cpu) / NCPUBITS)
#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS))
#define SH_MAGIC 0x510510 #define SH_MAGIC 0x510510
@ -103,10 +108,10 @@ typedef struct {
int num_nodes; int num_nodes;
int num_procs; int num_procs;
int final_num_procs; int final_num_procs;
unsigned long avail; unsigned long avail [MAX_BITMASK_LEN];
int avail_count;
unsigned long cpu_info [MAX_CPUS]; unsigned long cpu_info [MAX_CPUS];
unsigned long node_info [MAX_NODES]; unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN];
int cpu_use[MAX_CPUS]; int cpu_use[MAX_CPUS];
} shm_t; } shm_t;
@ -126,7 +131,8 @@ static shm_t *common = (void *)-1;
static int shmid, pshmid; static int shmid, pshmid;
static void *paddr; static void *paddr;
static unsigned long lprocmask, lnodemask; static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask;
static int lprocmask_count = 0;
static int numprocs = 1; static int numprocs = 1;
static int numnodes = 1; static int numnodes = 1;
@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) {
than sizeof(unsigned long). On 64 bits, the limit than sizeof(unsigned long). On 64 bits, the limit
is 64. On 32 bits, it is 32. is 64. On 32 bits, it is 32.
***/ ***/
static inline unsigned long get_cpumap(int node) { static inline void get_cpumap(int node, unsigned long * node_info) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char name[160]; char name[160];
char cpumap[160]; char cpumap[160];
char *p, *dummy; char *dummy;
int i=0; int i=0;
int count=0;
int k=0;
sprintf(name, CPUMAP_NAME, node); sprintf(name, CPUMAP_NAME, node);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
for(i=0; i<32; i++){
affinity[i] = 0;
}
affinity = 0;
if (infile != -1) { if (infile != -1) {
read(infile, cpumap, sizeof(cpumap)); read(infile, cpumap, sizeof(cpumap));
p = cpumap;
while (*p != '\n' && i<160){ for(i=0; i<160; i++){
if(*p != ',') { if(cpumap[i] == '\n')
name[i++]=*p; break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
//Enough data for Hex
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
} }
p++;
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
node_info[i]=affinity[count-i-1];
} }
p = name;
// while ((*p == '0') || (*p == ',')) p++;
affinity = strtoul(p, &dummy, 16);
close(infile); close(infile);
} }
return affinity; return ;
} }
static inline unsigned long get_share(int cpu, int level) { static inline void get_share(int cpu, int level, unsigned long * share) {
int infile; int infile;
unsigned long affinity; unsigned long affinity[32];
char cpumap[160];
char name[160]; char name[160];
char *p; char *dummy;
int count=0;
int i=0,k=0;
int bitmask_idx = 0;
sprintf(name, SHARE_NAME, cpu, level); sprintf(name, SHARE_NAME, cpu, level);
infile = open(name, O_RDONLY); infile = open(name, O_RDONLY);
affinity = (1UL << cpu); // Init share
for(i=0; i<MAX_BITMASK_LEN; i++){
share[i]=0;
}
bitmask_idx = CPUELT(cpu);
share[bitmask_idx] = CPUMASK(cpu);
if (infile != -1) { if (infile != -1) {
read(infile, name, sizeof(name)); read(infile, cpumap, sizeof(cpumap));
for(i=0; i<160; i++){
if(cpumap[i] == '\n')
break;
if(cpumap[i] != ','){
name[k++]=cpumap[i];
//Enough data
if(k >= NCPUBITS/4){
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
}
}
if(k!=0){
name[k]='\0';
affinity[count++] = strtoul(name, &dummy, 16);
k=0;
}
// 0-63bit -> node_info[0], 64-128bit -> node_info[1] ....
// revert the sequence
for(i=0; i<count && i<MAX_BITMASK_LEN; i++){
share[i]=affinity[count-i-1];
}
p = name;
while ((*p == '0') || (*p == ',')) p++;
affinity = strtol(p, &p, 16);
close(infile); close(infile);
} }
return affinity; return ;
} }
static int numa_check(void) { static int numa_check(void) {
@ -248,6 +298,7 @@ static int numa_check(void) {
DIR *dp; DIR *dp;
struct dirent *dir; struct dirent *dir;
int node; int node;
int j;
common -> num_nodes = 0; common -> num_nodes = 0;
@ -258,7 +309,9 @@ static int numa_check(void) {
return 0; return 0;
} }
for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; for (node = 0; node < MAX_NODES; node ++) {
for (j = 0; j<MAX_BITMASK_LEN; j++) common -> node_info[node][j] = 0;
}
while ((dir = readdir(dp)) != NULL) { while ((dir = readdir(dp)) != NULL) {
if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) {
@ -266,12 +319,12 @@ static int numa_check(void) {
node = atoi(&dir -> d_name[4]); node = atoi(&dir -> d_name[4]);
if (node > MAX_NODES) { if (node > MAX_NODES) {
fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n");
exit(1); exit(1);
} }
common -> num_nodes ++; common -> num_nodes ++;
common -> node_info[node] = get_cpumap(node); get_cpumap(node, common->node_info[node]);
} }
} }
@ -284,7 +337,7 @@ static int numa_check(void) {
fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes);
for (node = 0; node < common -> num_nodes; node ++) for (node = 0; node < common -> num_nodes; node ++)
fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]);
#endif #endif
return common -> num_nodes; return common -> num_nodes;
@ -296,11 +349,13 @@ static void numa_mapping(void) {
int i, j, h; int i, j, h;
unsigned long work, bit; unsigned long work, bit;
int count = 0; int count = 0;
int bitmask_idx = 0;
for (node = 0; node < common -> num_nodes; node ++) { for (node = 0; node < common -> num_nodes; node ++) {
core = 0; core = 0;
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
if (common -> node_info[node] & common -> avail & (1UL << cpu)) { bitmask_idx = CPUELT(cpu);
if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) {
common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu);
count ++; count ++;
core ++; core ++;
@ -357,58 +412,89 @@ static void numa_mapping(void) {
static void disable_hyperthread(void) { static void disable_hyperthread(void) {
unsigned long share; unsigned long share[MAX_BITMASK_LEN];
int cpu; int cpu;
int bitmask_idx = 0;
int i=0, count=0;
bitmask_idx = CPUELT(common -> num_procs);
if(common->num_procs > 64){ for(i=0; i< bitmask_idx; i++){
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL;
exit(1); }
}else if(common->num_procs == 64){ if(CPUMASK(common -> num_procs) != 1){
common -> avail = 0xFFFFFFFFFFFFFFFFUL; common -> avail[count++] = CPUMASK(common -> num_procs) - 1;
}else }
common -> avail = (1UL << common -> num_procs) - 1; common -> avail_count = count;
/* if(common->num_procs > 64){ */
/* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */
/* exit(1); */
/* }else if(common->num_procs == 64){ */
/* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */
/* }else */
/* common -> avail = (1UL << common -> num_procs) - 1; */
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); fprintf(stderr, "\nAvail CPUs : ");
for(i=0; i<count; i++)
fprintf(stderr, "%04lx ", common -> avail[i]);
fprintf(stderr, ".\n");
#endif #endif
for (cpu = 0; cpu < common -> num_procs; cpu ++) { for (cpu = 0; cpu < common -> num_procs; cpu ++) {
share = (get_share(cpu, 1) & common -> avail); get_share(cpu, 1, share);
if (popcount(share) > 1) { //When the shared cpu are in different element of share & avail array, this may be a bug.
for (i = 0; i < count ; i++){
if (popcount(share[i]) > 1) {
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n",
cpu, share & ~(1UL << cpu)); cpu, share[i] & ~(CPUMASK(cpu)));
#endif #endif
common -> avail &= ~((share & ~(1UL << cpu))); common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu)));
}
} }
} }
} }
static void disable_affinity(void) { static void disable_affinity(void) {
int i=0;
int bitmask_idx=0;
int count=0;
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]);
fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]);
#endif #endif
if(common->final_num_procs > 64){ /* if(common->final_num_procs > 64){ */
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); */
exit(1); /* exit(1); */
}else if(common->final_num_procs == 64){ /* }else if(common->final_num_procs == 64){ */
lprocmask = 0xFFFFFFFFFFFFFFFFUL; /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */
}else /* }else */
lprocmask = (1UL << common -> final_num_procs) - 1; /* lprocmask = (1UL << common -> final_num_procs) - 1; */
bitmask_idx = CPUELT(common -> final_num_procs);
for(i=0; i< bitmask_idx; i++){
lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL;
}
if(CPUMASK(common -> final_num_procs) != 1){
lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1;
}
lprocmask_count = count;
#ifndef USE_OPENMP #ifndef USE_OPENMP
lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; for(i=0; i< count; i++){
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
}
#endif #endif
#ifdef DEBUG #ifdef DEBUG
fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]);
#endif #endif
} }
@ -498,7 +584,7 @@ static void create_pshmem(void) {
static void local_cpu_map(void) { static void local_cpu_map(void) {
int cpu, id, mapping; int cpu, id, mapping;
int bitmask_idx = 0;
cpu = 0; cpu = 0;
mapping = 0; mapping = 0;
@ -508,8 +594,9 @@ static void local_cpu_map(void) {
if (id > 0) { if (id > 0) {
if (is_dead(id)) common -> cpu_use[cpu] = 0; if (is_dead(id)) common -> cpu_use[cpu] = 0;
} }
if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { bitmask_idx = CPUELT(cpu);
if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) {
common -> cpu_use[cpu] = pshmid; common -> cpu_use[cpu] = pshmid;
cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]);
@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) {
#ifndef USE_OPENMP #ifndef USE_OPENMP
cpu_set_t cpu_mask; cpu_set_t cpu_mask;
#endif #endif
int i;
if (initialized) return; if (initialized) return;
@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) {
common -> num_procs = get_nprocs(); common -> num_procs = get_nprocs();
if(common -> num_procs > MAX_CPUS) {
fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS);
exit(1);
}
for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu;
numa_check(); numa_check();
@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping(); if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = popcount(common -> avail); common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) {
disable_affinity(); disable_affinity();
num_avail = popcount(lprocmask); num_avail = 0;
for(i=0; i<lprocmask_count; i++) num_avail += popcount(lprocmask[i]);
if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail; if ((numprocs <= 0) || (numprocs > num_avail)) numprocs = num_avail;

View File

@ -163,7 +163,7 @@ int get_L2_size(void){
int eax, ebx, ecx, edx; int eax, ebx, ecx, edx;
#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \
defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC)
@ -446,7 +446,7 @@ void blas_set_parameter(void){
#endif #endif
#endif #endif
#if defined(CORE_BARCELONA) #if defined(CORE_BARCELONA) || defined(CORE_BOBCAT)
size >>= 8; size >>= 8;
sgemm_p = 232 * size; sgemm_p = 232 * size;

View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_BARCELONA */ /* #define FORCE_BARCELONA */
/* #define FORCE_SHANGHAI */ /* #define FORCE_SHANGHAI */
/* #define FORCE_ISTANBUL */ /* #define FORCE_ISTANBUL */
/* #define FORCE_BOBCAT */
/* #define FORCE_SSE_GENERIC */ /* #define FORCE_SSE_GENERIC */
/* #define FORCE_VIAC3 */ /* #define FORCE_VIAC3 */
/* #define FORCE_NANO */ /* #define FORCE_NANO */
@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BARCELONA" #define CORENAME "BARCELONA"
#endif #endif
#if defined(FORCE_BOBCAT)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
#define SUBARCHITECTURE "BOBCAT"
#define ARCHCONFIG "-DBOBCAT " \
"-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=524288 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \
"-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \
"-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV"
#define LIBNAME "bobcat"
#define CORENAME "BOBCAT"
#endif
#ifdef FORCE_SSE_GENERIC #ifdef FORCE_SSE_GENERIC
#define FORCE #define FORCE
#define FORCE_INTEL #define FORCE_INTEL

View File

@ -794,6 +794,22 @@ static void init_parameter(void) {
#endif #endif
#endif #endif
#ifdef BOBCAT
#ifdef DEBUG
fprintf(stderr, "Bobcate\n");
#endif
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
#ifdef EXPRECISION
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
#endif
#endif
#ifdef NANO #ifdef NANO
#ifdef DEBUG #ifdef DEBUG

59
kernel/x86/KERNEL.BOBCAT Normal file
View File

@ -0,0 +1,59 @@
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
SGEMMINCOPY =
SGEMMITCOPY =
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ =
SGEMMITCOPYOBJ =
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
CGEMMINCOPY =
CGEMMITCOPY =
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ =
CGEMMITCOPYOBJ =
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -439,7 +439,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -488,7 +488,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1697,7 +1697,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1727,7 +1727,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -437,7 +437,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -833,7 +833,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1848,7 +1848,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2109,7 +2109,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2429,7 +2429,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -2459,7 +2459,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2952,7 +2952,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -3148,7 +3148,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3389,7 +3389,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -3404,7 +3404,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -910,7 +910,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -959,7 +959,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2
@ -1439,7 +1439,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1469,7 +1469,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -872,7 +872,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -1316,7 +1316,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2
@ -1855,7 +1855,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1885,7 +1885,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -2249,7 +2249,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2562,7 +2562,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2957,7 +2957,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -2972,7 +2972,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -3280,7 +3280,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -3515,7 +3515,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0

View File

@ -69,7 +69,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHSIZE (8 * 10 + 4) #define PREFETCHSIZE (8 * 10 + 4)
#endif #endif
@ -1036,7 +1036,7 @@
.L42: .L42:
mulpd %xmm0, %xmm2 mulpd %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulpd 2 * SIZE(BB), %xmm0 mulpd 2 * SIZE(BB), %xmm0
@ -1066,7 +1066,7 @@
addpd %xmm0, %xmm7 addpd %xmm0, %xmm7
movapd 16 * SIZE(AA), %xmm0 movapd 16 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulpd %xmm1, %xmm2 mulpd %xmm1, %xmm2
@ -2224,7 +2224,7 @@
.L22: .L22:
mulsd %xmm0, %xmm2 mulsd %xmm0, %xmm2
addsd %xmm2, %xmm4 addsd %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movlpd 2 * SIZE(BB), %xmm2 movlpd 2 * SIZE(BB), %xmm2
@ -2273,7 +2273,7 @@
movlpd 40 * SIZE(BB), %xmm3 movlpd 40 * SIZE(BB), %xmm3
addsd %xmm0, %xmm7 addsd %xmm0, %xmm7
movlpd 8 * SIZE(AA), %xmm0 movlpd 8 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
#endif #endif
mulsd %xmm1, %xmm2 mulsd %xmm1, %xmm2

View File

@ -64,7 +64,7 @@
#define BORIG 60(%esp) #define BORIG 60(%esp)
#define BUFFER 128(%esp) #define BUFFER 128(%esp)
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
@ -439,7 +439,7 @@
.L92: .L92:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(AA), %xmm0 movaps 4 * SIZE(AA), %xmm0
@ -454,7 +454,7 @@
mulps 12 * SIZE(BB), %xmm0 mulps 12 * SIZE(BB), %xmm0
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm3 mulps %xmm1, %xmm3
@ -758,7 +758,7 @@
.L102: .L102:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movsd 2 * SIZE(AA), %xmm0 movsd 2 * SIZE(AA), %xmm0
@ -993,7 +993,7 @@
.L112: .L112:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 1 * SIZE(AA), %xmm0 movss 1 * SIZE(AA), %xmm0
@ -1324,7 +1324,7 @@
.L52: .L52:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulps 4 * SIZE(BB), %xmm0 mulps 4 * SIZE(BB), %xmm0
@ -1354,7 +1354,7 @@
addps %xmm0, %xmm5 addps %xmm0, %xmm5
movaps 32 * SIZE(AA), %xmm0 movaps 32 * SIZE(AA), %xmm0
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
@ -1718,7 +1718,7 @@
ALIGN_4 ALIGN_4
.L62: .L62:
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
@ -2031,7 +2031,7 @@
.L72: .L72:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
mulss 4 * SIZE(BB), %xmm0 mulss 4 * SIZE(BB), %xmm0
@ -2859,7 +2859,7 @@
.L22: .L22:
mulps %xmm0, %xmm2 mulps %xmm0, %xmm2
addps %xmm2, %xmm4 addps %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movaps 4 * SIZE(BB), %xmm2 movaps 4 * SIZE(BB), %xmm2
@ -3303,7 +3303,7 @@
.L32: .L32:
mulss %xmm0, %xmm2 mulss %xmm0, %xmm2
addss %xmm2, %xmm4 addss %xmm2, %xmm4
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
#endif #endif
movss 4 * SIZE(BB), %xmm2 movss 4 * SIZE(BB), %xmm2

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -533,7 +533,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -994,7 +994,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -75,7 +75,7 @@
#define STACK_ALIGN 4096 #define STACK_ALIGN 4096
#define STACK_OFFSET 1024 #define STACK_OFFSET 1024
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
#define PREFETCHSIZE (16 * 10 + 8) #define PREFETCHSIZE (16 * 10 + 8)
#define WPREFETCHSIZE 112 #define WPREFETCHSIZE 112
#define PREFETCH prefetch #define PREFETCH prefetch
@ -1820,7 +1820,7 @@
addps %xmm0, %xmm7 addps %xmm0, %xmm7
movsd 16 * SIZE(AA), %xmm0 movsd 16 * SIZE(AA), %xmm0
mulps %xmm1, %xmm2 mulps %xmm1, %xmm2
#if defined(OPTERON) || defined(BARCELONA) #if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT)
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
#endif #endif
addps %xmm2, %xmm4 addps %xmm2, %xmm4

View File

@ -0,0 +1,62 @@
# Kernel selection for the BOBCAT target (Makefile fragment).
# NOTE(review): the file name is not visible here; presumably this is the
# x86_64 KERNEL.BOBCAT file -- confirm.
#
# No Bobcat-specific source file is referenced: the GEMM, GEMM3M and DTRSM
# kernels named below are the *_barcelona.S sources, and the real-precision
# outer copy routines are the *_opteron.S ones.
#
# $(TSUFFIX) and $(SUFFIX) are not defined in this fragment; they are expected
# to be supplied by the build files that include it.
#
# --- Double-complex GEMV kernels (N and T variants) ---
ZGEMVNKERNEL = zgemv_n_dup.S
ZGEMVTKERNEL = zgemv_t_dup.S
# --- Single-precision GEMM: 8x4 kernel ---
# IN/IT copies use the generic C 8-wide routines; ON/OT copies use the
# 4-wide Opteron assembly routines.
SGEMMKERNEL = gemm_kernel_8x4_barcelona.S
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = gemm_ncopy_4_opteron.S
SGEMMOTCOPY = gemm_tcopy_4_opteron.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- Double-precision GEMM: 4x4 kernel ---
# The IN/IT copy sources and objects are intentionally left empty.
# NOTE(review): presumably the ON/OT 4-wide copies serve both operands because
# the kernel is square (4x4) -- confirm against the consuming kernel Makefile.
DGEMMKERNEL = gemm_kernel_4x4_barcelona.S
DGEMMINCOPY =
DGEMMITCOPY =
DGEMMONCOPY = gemm_ncopy_4_opteron.S
DGEMMOTCOPY = gemm_tcopy_4_opteron.S
DGEMMINCOPYOBJ =
DGEMMITCOPYOBJ =
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- Single-complex GEMM: 4x2 kernel ---
# IN/IT copies use the generic C 4-wide routines; ON/OT copies use the
# 2-wide assembly routines.
CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = zgemm_ncopy_2.S
CGEMMOTCOPY = zgemm_tcopy_2.S
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- Double-complex GEMM: 2x2 kernel ---
# As with DGEMM, the IN/IT copy sources and objects are left empty.
ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
ZGEMMINCOPY =
ZGEMMITCOPY =
ZGEMMONCOPY = zgemm_ncopy_2.S
ZGEMMOTCOPY = zgemm_tcopy_2.S
ZGEMMINCOPYOBJ =
ZGEMMITCOPYOBJ =
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# --- TRSM kernels (LN / LT / RN / RT variants per precision) ---
# In every precision the RN variant points at the same source file as LT;
# no separate *_RN_* source is listed.
STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S
STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S
STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S
DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S
DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S
DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S
CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S
CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S
CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S
ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S
ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S
ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S
# --- GEMM3M kernels (single-complex 8x4, double-complex 4x4) ---
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S

View File

@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -160,7 +160,7 @@
#define a3 %xmm14 #define a3 %xmm14
#define xt1 %xmm15 #define xt1 %xmm15
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) #if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP(a, b, c) movddup a(b), c
#define MOVDDUP2(a, b, c) movddup a##b, c #define MOVDDUP2(a, b, c) movddup a##b, c
#else #else

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -76,7 +76,7 @@
#define movsd movlpd #define movsd movlpd
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHSIZE (16 * 16) #define PREFETCHSIZE (16 * 16)

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -86,7 +86,7 @@
#define BORIG 72(%rsp) #define BORIG 72(%rsp)
#define BUFFER 128(%rsp) #define BUFFER 128(%rsp)
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -95,7 +95,7 @@
#define PREFETCHSIZE (8 * 6 + 4) #define PREFETCHSIZE (8 * 6 + 4)
#endif #endif
#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) #if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define PREFETCH prefetch #define PREFETCH prefetch
#define PREFETCHW prefetchw #define PREFETCHW prefetchw
#define PREFETCHNTA prefetchnta #define PREFETCHNTA prefetchnta

View File

@ -67,6 +67,13 @@
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#endif #endif
/*
 * Tuning block for the BOBCAT target.
 *
 * Maps the PREFETCH / PREFETCHW macros onto the `prefetch` / `prefetchw`
 * mnemonics, sets the prefetch distance to 128 * 5, and defines
 * ALIGNED_ACCESS.
 *
 * NOTE(review): the units of PREFETCHSIZE and the exact effect of
 * ALIGNED_ACCESS depend on use sites that are not visible in this hunk --
 * confirm there before retuning.
 */
#ifdef BOBCAT
#define PREFETCH prefetch
#define PREFETCHW prefetchw
#define PREFETCHSIZE (128 * 5)
#define ALIGNED_ACCESS
#endif
#ifdef NANO #ifdef NANO
#define PREFETCH prefetcht0 #define PREFETCH prefetcht0
#define PREFETCHW prefetcht0 #define PREFETCHW prefetcht0

View File

@ -85,7 +85,7 @@
#define movsd movlps #define movsd movlps
#endif #endif
#if defined(BARCELONA) || defined(SHANGHAI) #if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT)
#define ALIGNED_ACCESS #define ALIGNED_ACCESS
#define MOVUPS_A movaps #define MOVUPS_A movaps
#define MOVUPS_XL movaps #define MOVUPS_XL movaps

64
param.h
View File

@ -1,5 +1,5 @@
/***************************************************************************** /*****************************************************************************
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS
All rights reserved. All rights reserved.
Redistribution and use in source and binary forms, with or without Redistribution and use in source and binary forms, with or without
@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif #endif
#define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r
#define SYMV_P 16
#define HAVE_EXCLUSIVE_CACHE
#define GEMM_THREAD gemm_thread_mn
#endif
#if defined(BOBCAT)
#define SNUMOPT 8
#define DNUMOPT 4
#define GEMM_DEFAULT_OFFSET_A 64
#define GEMM_DEFAULT_OFFSET_B 832
#define GEMM_DEFAULT_ALIGN 0x0fffUL
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define XGEMM_DEFAULT_UNROLL_N 1
#ifdef ARCH_X86
#define SGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1
#else
#define SGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_M 4
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_M 2
#define XGEMM_DEFAULT_UNROLL_M 1
#endif
#define SGEMM_DEFAULT_P 448
#define DGEMM_DEFAULT_P 224
#define QGEMM_DEFAULT_P 112
#define CGEMM_DEFAULT_P 224
#define ZGEMM_DEFAULT_P 112
#define XGEMM_DEFAULT_P 56
#define SGEMM_DEFAULT_Q 224
#define DGEMM_DEFAULT_Q 224
#define QGEMM_DEFAULT_Q 224
#define CGEMM_DEFAULT_Q 224
#define ZGEMM_DEFAULT_Q 224
#define XGEMM_DEFAULT_Q 224
#define SGEMM_DEFAULT_R sgemm_r #define SGEMM_DEFAULT_R sgemm_r
#define QGEMM_DEFAULT_R qgemm_r #define QGEMM_DEFAULT_R qgemm_r
#define DGEMM_DEFAULT_R dgemm_r #define DGEMM_DEFAULT_R dgemm_r