Support AMD Piledriver using the Bulldozer kernels.
This commit is contained in:
parent
0c4074e10b
commit
886cbaf4e4
|
@ -311,14 +311,14 @@ ifeq ($(ARCH), x86)
|
||||||
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
|
||||||
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), x86_64)
|
ifeq ($(ARCH), x86_64)
|
||||||
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
|
||||||
ifneq ($(NO_AVX), 1)
|
ifneq ($(NO_AVX), 1)
|
||||||
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER
|
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
|
|
@ -48,6 +48,7 @@ Please read GotoBLAS_01Readme.txt
|
||||||
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
|
- **Intel Haswell**: Optimized Level-3 BLAS with AVX on x86-64 (identical to Sandy Bridge).
|
||||||
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
|
||||||
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
|
- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar)
|
||||||
|
- **AMD Piledriver**: Used Bulldozer codes.
|
||||||
|
|
||||||
#### MIPS64:
|
#### MIPS64:
|
||||||
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
- **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.
|
||||||
|
|
|
@ -171,6 +171,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
#define MMXSTORE movd
|
#define MMXSTORE movd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||||
|
//Enable some Barcelona optimizations on Bulldozer/Piledriver.
|
||||||
|
#define BARCELONA_OPTIMIZATION
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_3DNOW)
|
#if defined(HAVE_3DNOW)
|
||||||
#define EMMS femms
|
#define EMMS femms
|
||||||
#elif defined(HAVE_MMX)
|
#elif defined(HAVE_MMX)
|
||||||
|
|
|
@ -218,6 +218,11 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){
|
||||||
|
|
||||||
#ifdef ASSEMBLER
|
#ifdef ASSEMBLER
|
||||||
|
|
||||||
|
#if defined(PILEDRIVER) || defined(BULLDOZER)
|
||||||
|
//Enable some Barcelona optimizations on Bulldozer/Piledriver.
|
||||||
|
#define BARCELONA_OPTIMIZATION
|
||||||
|
#endif
|
||||||
|
|
||||||
#if defined(HAVE_3DNOW)
|
#if defined(HAVE_3DNOW)
|
||||||
#define EMMS femms
|
#define EMMS femms
|
||||||
#elif defined(HAVE_MMX)
|
#elif defined(HAVE_MMX)
|
||||||
|
|
3
cpuid.h
3
cpuid.h
|
@ -106,6 +106,7 @@
|
||||||
#define CORE_SANDYBRIDGE 20
|
#define CORE_SANDYBRIDGE 20
|
||||||
#define CORE_BOBCAT 21
|
#define CORE_BOBCAT 21
|
||||||
#define CORE_BULLDOZER 22
|
#define CORE_BULLDOZER 22
|
||||||
|
#define CORE_PILEDRIVER 23
|
||||||
#define CORE_HASWELL CORE_SANDYBRIDGE
|
#define CORE_HASWELL CORE_SANDYBRIDGE
|
||||||
|
|
||||||
#define HAVE_SSE (1 << 0)
|
#define HAVE_SSE (1 << 0)
|
||||||
|
@ -128,6 +129,7 @@
|
||||||
#define HAVE_FASTMOVU (1 << 17)
|
#define HAVE_FASTMOVU (1 << 17)
|
||||||
#define HAVE_AVX (1 << 18)
|
#define HAVE_AVX (1 << 18)
|
||||||
#define HAVE_FMA4 (1 << 19)
|
#define HAVE_FMA4 (1 << 19)
|
||||||
|
#define HAVE_FMA3 (1 << 20)
|
||||||
|
|
||||||
#define CACHE_INFO_L1_I 1
|
#define CACHE_INFO_L1_I 1
|
||||||
#define CACHE_INFO_L1_D 2
|
#define CACHE_INFO_L1_D 2
|
||||||
|
@ -197,6 +199,7 @@ typedef struct {
|
||||||
#define CPUTYPE_SANDYBRIDGE 44
|
#define CPUTYPE_SANDYBRIDGE 44
|
||||||
#define CPUTYPE_BOBCAT 45
|
#define CPUTYPE_BOBCAT 45
|
||||||
#define CPUTYPE_BULLDOZER 46
|
#define CPUTYPE_BULLDOZER 46
|
||||||
|
#define CPUTYPE_PILEDRIVER 47
|
||||||
// this define is because BLAS doesn't have haswell specific optimizations yet
|
// this define is because BLAS doesn't have haswell specific optimizations yet
|
||||||
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
|
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE
|
||||||
|
|
||||||
|
|
47
cpuid_x86.c
47
cpuid_x86.c
|
@ -47,6 +47,8 @@
|
||||||
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
#define CORE_SANDYBRIDGE CORE_NEHALEM
|
||||||
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
|
||||||
#define CORE_BULLDOZER CORE_BARCELONA
|
#define CORE_BULLDOZER CORE_BARCELONA
|
||||||
|
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
|
||||||
|
#define CORE_PILEDRIVER CORE_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#ifndef CPUIDEMU
|
#ifndef CPUIDEMU
|
||||||
|
@ -228,6 +230,7 @@ int get_cputype(int gettype){
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
if (support_avx()) feature |= HAVE_AVX;
|
if (support_avx()) feature |= HAVE_AVX;
|
||||||
#endif
|
#endif
|
||||||
|
if ((ecx & (1 << 20)) != 0) feature |= HAVE_FMA3;
|
||||||
|
|
||||||
if (have_excpuid() >= 0x01) {
|
if (have_excpuid() >= 0x01) {
|
||||||
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
cpuid(0x80000001, &eax, &ebx, &ecx, &edx);
|
||||||
|
@ -1100,11 +1103,21 @@ int get_cpuname(void){
|
||||||
case 1:
|
case 1:
|
||||||
case 10:
|
case 10:
|
||||||
return CPUTYPE_BARCELONA;
|
return CPUTYPE_BARCELONA;
|
||||||
case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
case 6:
|
||||||
if(support_avx())
|
switch (model) {
|
||||||
return CPUTYPE_BULLDOZER;
|
case 1:
|
||||||
else
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
if(support_avx())
|
||||||
|
return CPUTYPE_BULLDOZER;
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
case 2:
|
||||||
|
if(support_avx())
|
||||||
|
return CPUTYPE_PILEDRIVER;
|
||||||
|
else
|
||||||
|
return CPUTYPE_BARCELONA; //OS don't support AVX.
|
||||||
|
}
|
||||||
|
break;
|
||||||
case 5:
|
case 5:
|
||||||
return CPUTYPE_BOBCAT;
|
return CPUTYPE_BOBCAT;
|
||||||
}
|
}
|
||||||
|
@ -1229,6 +1242,7 @@ static char *cpuname[] = {
|
||||||
"SANDYBRIDGE",
|
"SANDYBRIDGE",
|
||||||
"BOBCAT",
|
"BOBCAT",
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
|
"PILEDRIVER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *lowercpuname[] = {
|
static char *lowercpuname[] = {
|
||||||
|
@ -1278,6 +1292,7 @@ static char *lowercpuname[] = {
|
||||||
"sandybridge",
|
"sandybridge",
|
||||||
"bobcat",
|
"bobcat",
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
|
"piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename[] = {
|
static char *corename[] = {
|
||||||
|
@ -1304,6 +1319,7 @@ static char *corename[] = {
|
||||||
"SANDYBRIDGE",
|
"SANDYBRIDGE",
|
||||||
"BOBCAT",
|
"BOBCAT",
|
||||||
"BULLDOZER",
|
"BULLDOZER",
|
||||||
|
"PILEDRIVER",
|
||||||
};
|
};
|
||||||
|
|
||||||
static char *corename_lower[] = {
|
static char *corename_lower[] = {
|
||||||
|
@ -1330,6 +1346,7 @@ static char *corename_lower[] = {
|
||||||
"sandybridge",
|
"sandybridge",
|
||||||
"bobcat",
|
"bobcat",
|
||||||
"bulldozer",
|
"bulldozer",
|
||||||
|
"piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
@ -1472,11 +1489,19 @@ int get_coretype(void){
|
||||||
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON;
|
||||||
else if (exfamily == 5) return CORE_BOBCAT;
|
else if (exfamily == 5) return CORE_BOBCAT;
|
||||||
else if (exfamily == 6) {
|
else if (exfamily == 6) {
|
||||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
switch (model) {
|
||||||
if(support_avx())
|
case 1:
|
||||||
return CORE_BULLDOZER;
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
else
|
if(support_avx())
|
||||||
return CORE_BARCELONA; //OS don't support AVX. Use old kernels.
|
return CORE_BULLDOZER;
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
|
case 2:
|
||||||
|
if(support_avx())
|
||||||
|
return CORE_PILEDRIVER;
|
||||||
|
else
|
||||||
|
return CORE_BARCELONA; //OS don't support AVX.
|
||||||
|
}
|
||||||
}else return CORE_BARCELONA;
|
}else return CORE_BARCELONA;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -1564,6 +1589,7 @@ void get_cpuconfig(void){
|
||||||
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n");
|
||||||
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n");
|
||||||
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n");
|
||||||
|
if (features & HAVE_FMA3 ) printf("#define HAVE_FMA3\n");
|
||||||
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n");
|
||||||
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n");
|
||||||
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n");
|
||||||
|
@ -1631,5 +1657,6 @@ void get_sse(void){
|
||||||
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n");
|
||||||
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n");
|
||||||
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n");
|
||||||
|
if (features & HAVE_FMA3 ) printf("HAVE_FMA3=1\n");
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
|
@ -64,10 +64,12 @@ extern gotoblas_t gotoblas_BOBCAT;
|
||||||
#ifndef NO_AVX
|
#ifndef NO_AVX
|
||||||
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
extern gotoblas_t gotoblas_SANDYBRIDGE;
|
||||||
extern gotoblas_t gotoblas_BULLDOZER;
|
extern gotoblas_t gotoblas_BULLDOZER;
|
||||||
|
extern gotoblas_t gotoblas_PILEDRIVER;
|
||||||
#else
|
#else
|
||||||
//Use NEHALEM kernels for sandy bridge
|
//Use NEHALEM kernels for sandy bridge
|
||||||
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
|
||||||
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
#define gotoblas_BULLDOZER gotoblas_BARCELONA
|
||||||
|
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
|
||||||
#endif
|
#endif
|
||||||
//Use sandy bridge kernels for haswell.
|
//Use sandy bridge kernels for haswell.
|
||||||
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
|
||||||
|
@ -228,13 +230,23 @@ static gotoblas_t *get_coretype(void){
|
||||||
} else if (exfamily == 5) {
|
} else if (exfamily == 5) {
|
||||||
return &gotoblas_BOBCAT;
|
return &gotoblas_BOBCAT;
|
||||||
} else if (exfamily == 6) {
|
} else if (exfamily == 6) {
|
||||||
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
if(model == 1){
|
||||||
|
//AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series
|
||||||
if(support_avx())
|
if(support_avx())
|
||||||
return &gotoblas_BULLDOZER;
|
return &gotoblas_BULLDOZER;
|
||||||
else{
|
else{
|
||||||
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||||
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
}
|
}
|
||||||
|
}else if(model == 2){
|
||||||
|
//AMD Piledriver Opteron 6300 / Opteron 4300 / Opteron 3300
|
||||||
|
if(support_avx())
|
||||||
|
return &gotoblas_PILEDRIVER;
|
||||||
|
else{
|
||||||
|
fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n");
|
||||||
|
return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels.
|
||||||
|
}
|
||||||
|
}
|
||||||
} else {
|
} else {
|
||||||
return &gotoblas_BARCELONA;
|
return &gotoblas_BARCELONA;
|
||||||
}
|
}
|
||||||
|
@ -272,6 +284,7 @@ static char *corename[] = {
|
||||||
"Sandybridge",
|
"Sandybridge",
|
||||||
"Bobcat",
|
"Bobcat",
|
||||||
"Bulldozer",
|
"Bulldozer",
|
||||||
|
"Piledriver",
|
||||||
};
|
};
|
||||||
|
|
||||||
char *gotoblas_corename(void) {
|
char *gotoblas_corename(void) {
|
||||||
|
@ -294,6 +307,7 @@ char *gotoblas_corename(void) {
|
||||||
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16];
|
||||||
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
if (gotoblas == &gotoblas_BOBCAT) return corename[17];
|
||||||
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
if (gotoblas == &gotoblas_BULLDOZER) return corename[18];
|
||||||
|
if (gotoblas == &gotoblas_PILEDRIVER) return corename[19];
|
||||||
|
|
||||||
return corename[0];
|
return corename[0];
|
||||||
}
|
}
|
||||||
|
|
17
getarch.c
17
getarch.c
|
@ -106,6 +106,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
/* #define FORCE_ISTANBUL */
|
/* #define FORCE_ISTANBUL */
|
||||||
/* #define FORCE_BOBCAT */
|
/* #define FORCE_BOBCAT */
|
||||||
/* #define FORCE_BULLDOZER */
|
/* #define FORCE_BULLDOZER */
|
||||||
|
/* #define FORCE_PILEDRIVER */
|
||||||
/* #define FORCE_SSE_GENERIC */
|
/* #define FORCE_SSE_GENERIC */
|
||||||
/* #define FORCE_VIAC3 */
|
/* #define FORCE_VIAC3 */
|
||||||
/* #define FORCE_NANO */
|
/* #define FORCE_NANO */
|
||||||
|
@ -398,6 +399,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
#define CORENAME "BULLDOZER"
|
#define CORENAME "BULLDOZER"
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#if defined (FORCE_PILEDRIVER)
|
||||||
|
#define FORCE
|
||||||
|
#define FORCE_INTEL
|
||||||
|
#define ARCHITECTURE "X86"
|
||||||
|
#define SUBARCHITECTURE "PILEDRIVER"
|
||||||
|
/* Configuration flags emitted when the PILEDRIVER target is forced.
   The adjacent string literals are concatenated into one string, so every
   piece except the last must end in a space to keep the -D options apart. */
#define ARCHCONFIG   "-DPILEDRIVER " \
		     "-DL1_DATA_SIZE=16384 -DL1_DATA_LINESIZE=64 " \
		     "-DL2_SIZE=2097152 -DL2_LINESIZE=64 -DL3_SIZE=12582912 " \
		     "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \
		     "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 " \
		     "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU -DHAVE_CFLUSH " \
		     "-DHAVE_AVX -DHAVE_FMA4 -DHAVE_FMA3"
|
||||||
|
#define LIBNAME "piledriver"
|
||||||
|
#define CORENAME "PILEDRIVER"
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef FORCE_SSE_GENERIC
|
#ifdef FORCE_SSE_GENERIC
|
||||||
#define FORCE
|
#define FORCE
|
||||||
#define FORCE_INTEL
|
#define FORCE_INTEL
|
||||||
|
|
|
@ -826,6 +826,22 @@ static void init_parameter(void) {
|
||||||
#endif
|
#endif
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
#ifdef PILEDRIVER
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
fprintf(stderr, "Piledriver\n");
|
||||||
|
#endif
|
||||||
|
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
#ifdef EXPRECISION
|
||||||
|
TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
#endif
|
||||||
|
|
||||||
#ifdef NANO
|
#ifdef NANO
|
||||||
|
|
||||||
#ifdef DEBUG
|
#ifdef DEBUG
|
||||||
|
|
|
@ -0,0 +1,59 @@
|
||||||
|
SGEMMKERNEL = gemm_kernel_4x4_barcelona.S
|
||||||
|
SGEMMINCOPY =
|
||||||
|
SGEMMITCOPY =
|
||||||
|
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
SGEMMINCOPYOBJ =
|
||||||
|
SGEMMITCOPYOBJ =
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = gemm_kernel_2x4_barcelona.S
|
||||||
|
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
|
||||||
|
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
|
||||||
|
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
|
||||||
|
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S
|
||||||
|
CGEMMINCOPY =
|
||||||
|
CGEMMITCOPY =
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ =
|
||||||
|
CGEMMITCOPYOBJ =
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S
|
||||||
|
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
|
||||||
|
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S
|
||||||
|
STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S
|
||||||
|
STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S
|
||||||
|
DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S
|
||||||
|
CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S
|
||||||
|
CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S
|
||||||
|
ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -488,7 +488,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1697,7 +1697,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1727,7 +1727,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -437,7 +437,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -833,7 +833,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1848,7 +1848,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2109,7 +2109,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2429,7 +2429,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2459,7 +2459,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2952,7 +2952,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -3148,7 +3148,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3389,7 +3389,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -3404,7 +3404,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -910,7 +910,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -959,7 +959,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
@ -1439,7 +1439,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1469,7 +1469,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -872,7 +872,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1316,7 +1316,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
@ -1855,7 +1855,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1885,7 +1885,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -2249,7 +2249,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2562,7 +2562,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2957,7 +2957,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -2972,7 +2972,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -3280,7 +3280,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -3515,7 +3515,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
|
|
@ -69,7 +69,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHSIZE (8 * 10 + 4)
|
#define PREFETCHSIZE (8 * 10 + 4)
|
||||||
#endif
|
#endif
|
||||||
|
@ -1036,7 +1036,7 @@
|
||||||
|
|
||||||
.L42:
|
.L42:
|
||||||
mulpd %xmm0, %xmm2
|
mulpd %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd 2 * SIZE(BB), %xmm0
|
mulpd 2 * SIZE(BB), %xmm0
|
||||||
|
@ -1066,7 +1066,7 @@
|
||||||
addpd %xmm0, %xmm7
|
addpd %xmm0, %xmm7
|
||||||
movapd 16 * SIZE(AA), %xmm0
|
movapd 16 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulpd %xmm1, %xmm2
|
mulpd %xmm1, %xmm2
|
||||||
|
@ -2224,7 +2224,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulsd %xmm0, %xmm2
|
mulsd %xmm0, %xmm2
|
||||||
addsd %xmm2, %xmm4
|
addsd %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movlpd 2 * SIZE(BB), %xmm2
|
movlpd 2 * SIZE(BB), %xmm2
|
||||||
|
@ -2273,7 +2273,7 @@
|
||||||
movlpd 40 * SIZE(BB), %xmm3
|
movlpd 40 * SIZE(BB), %xmm3
|
||||||
addsd %xmm0, %xmm7
|
addsd %xmm0, %xmm7
|
||||||
movlpd 8 * SIZE(AA), %xmm0
|
movlpd 8 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
PREFETCH (PREFETCHSIZE + 8) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulsd %xmm1, %xmm2
|
mulsd %xmm1, %xmm2
|
||||||
|
|
|
@ -64,7 +64,7 @@
|
||||||
#define BORIG 60(%esp)
|
#define BORIG 60(%esp)
|
||||||
#define BUFFER 128(%esp)
|
#define BUFFER 128(%esp)
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
|
@ -439,7 +439,7 @@
|
||||||
|
|
||||||
.L92:
|
.L92:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(AA), %xmm0
|
movaps 4 * SIZE(AA), %xmm0
|
||||||
|
@ -454,7 +454,7 @@
|
||||||
mulps 12 * SIZE(BB), %xmm0
|
mulps 12 * SIZE(BB), %xmm0
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm3
|
mulps %xmm1, %xmm3
|
||||||
|
@ -758,7 +758,7 @@
|
||||||
|
|
||||||
.L102:
|
.L102:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movsd 2 * SIZE(AA), %xmm0
|
movsd 2 * SIZE(AA), %xmm0
|
||||||
|
@ -993,7 +993,7 @@
|
||||||
|
|
||||||
.L112:
|
.L112:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 1 * SIZE(AA), %xmm0
|
movss 1 * SIZE(AA), %xmm0
|
||||||
|
@ -1324,7 +1324,7 @@
|
||||||
|
|
||||||
.L52:
|
.L52:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps 4 * SIZE(BB), %xmm0
|
mulps 4 * SIZE(BB), %xmm0
|
||||||
|
@ -1354,7 +1354,7 @@
|
||||||
addps %xmm0, %xmm5
|
addps %xmm0, %xmm5
|
||||||
movaps 32 * SIZE(AA), %xmm0
|
movaps 32 * SIZE(AA), %xmm0
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
|
@ -1718,7 +1718,7 @@
|
||||||
ALIGN_4
|
ALIGN_4
|
||||||
|
|
||||||
.L62:
|
.L62:
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
@ -2031,7 +2031,7 @@
|
||||||
|
|
||||||
.L72:
|
.L72:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
mulss 4 * SIZE(BB), %xmm0
|
mulss 4 * SIZE(BB), %xmm0
|
||||||
|
@ -2859,7 +2859,7 @@
|
||||||
.L22:
|
.L22:
|
||||||
mulps %xmm0, %xmm2
|
mulps %xmm0, %xmm2
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movaps 4 * SIZE(BB), %xmm2
|
movaps 4 * SIZE(BB), %xmm2
|
||||||
|
@ -3303,7 +3303,7 @@
|
||||||
.L32:
|
.L32:
|
||||||
mulss %xmm0, %xmm2
|
mulss %xmm0, %xmm2
|
||||||
addss %xmm2, %xmm4
|
addss %xmm2, %xmm4
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
movss 4 * SIZE(BB), %xmm2
|
movss 4 * SIZE(BB), %xmm2
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -533,7 +533,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -994,7 +994,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -75,7 +75,7 @@
|
||||||
#define STACK_ALIGN 4096
|
#define STACK_ALIGN 4096
|
||||||
#define STACK_OFFSET 1024
|
#define STACK_OFFSET 1024
|
||||||
|
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCHSIZE (16 * 10 + 8)
|
#define PREFETCHSIZE (16 * 10 + 8)
|
||||||
#define WPREFETCHSIZE 112
|
#define WPREFETCHSIZE 112
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
|
@ -1820,7 +1820,7 @@
|
||||||
addps %xmm0, %xmm7
|
addps %xmm0, %xmm7
|
||||||
movsd 16 * SIZE(AA), %xmm0
|
movsd 16 * SIZE(AA), %xmm0
|
||||||
mulps %xmm1, %xmm2
|
mulps %xmm1, %xmm2
|
||||||
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA)
|
||||||
#endif
|
#endif
|
||||||
addps %xmm2, %xmm4
|
addps %xmm2, %xmm4
|
||||||
|
|
|
@ -0,0 +1,70 @@
|
||||||
|
ZGEMVNKERNEL = zgemv_n_dup.S
|
||||||
|
ZGEMVTKERNEL = zgemv_t_dup.S
|
||||||
|
|
||||||
|
DGEMVNKERNEL = dgemv_n_bulldozer.S
|
||||||
|
DGEMVTKERNEL = dgemv_t_bulldozer.S
|
||||||
|
DAXPYKERNEL = daxpy_bulldozer.S
|
||||||
|
DDOTKERNEL = ddot_bulldozer.S
|
||||||
|
DCOPYKERNEL = dcopy_bulldozer.S
|
||||||
|
|
||||||
|
SGEMMKERNEL = sgemm_kernel_16x2_bulldozer.S
|
||||||
|
SGEMMINCOPY = ../generic/gemm_ncopy_16.c
|
||||||
|
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
|
||||||
|
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMKERNEL = dgemm_kernel_8x2_bulldozer.S
|
||||||
|
DGEMMINCOPY = dgemm_ncopy_8_bulldozer.S
|
||||||
|
DGEMMITCOPY = dgemm_tcopy_8_bulldozer.S
|
||||||
|
DGEMMONCOPY = gemm_ncopy_2_bulldozer.S
|
||||||
|
DGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
|
||||||
|
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMKERNEL = cgemm_kernel_4x2_bulldozer.S
|
||||||
|
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
|
||||||
|
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
|
||||||
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMKERNEL = zgemm_kernel_2x2_bulldozer.S
|
||||||
|
ZGEMMINCOPY =
|
||||||
|
ZGEMMITCOPY =
|
||||||
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
|
ZGEMMINCOPYOBJ =
|
||||||
|
ZGEMMITCOPYOBJ =
|
||||||
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
|
CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S
|
||||||
|
ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S
|
||||||
|
|
||||||
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
|
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||||
|
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||||
|
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
|
|
@ -160,7 +160,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -167,7 +167,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -166,7 +166,7 @@
|
||||||
#define xt1 %xmm14
|
#define xt1 %xmm14
|
||||||
#define xt2 %xmm15
|
#define xt2 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -76,7 +76,7 @@
|
||||||
#define movsd movlpd
|
#define movsd movlpd
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define PREFETCH prefetch
|
#define PREFETCH prefetch
|
||||||
#define PREFETCHW prefetchw
|
#define PREFETCHW prefetchw
|
||||||
#define PREFETCHSIZE (16 * 16)
|
#define PREFETCHSIZE (16 * 16)
|
||||||
|
@ -166,7 +166,7 @@
|
||||||
#define a3 %xmm14
|
#define a3 %xmm14
|
||||||
#define xt1 %xmm15
|
#define xt1 %xmm15
|
||||||
|
|
||||||
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER)
|
#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define MOVDDUP(a, b, c) movddup a(b), c
|
#define MOVDDUP(a, b, c) movddup a(b), c
|
||||||
#define MOVDDUP2(a, b, c) movddup a##b, c
|
#define MOVDDUP2(a, b, c) movddup a##b, c
|
||||||
#else
|
#else
|
||||||
|
|
|
@ -85,7 +85,7 @@
|
||||||
#define movsd movlps
|
#define movsd movlps
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER)
|
#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BARCELONA_OPTIMIZATION)
|
||||||
#define ALIGNED_ACCESS
|
#define ALIGNED_ACCESS
|
||||||
#define MOVUPS_A movaps
|
#define MOVUPS_A movaps
|
||||||
#define MOVUPS_XL movaps
|
#define MOVUPS_XL movaps
|
||||||
|
|
80
param.h
80
param.h
|
@ -234,6 +234,86 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#define XGEMM_DEFAULT_UNROLL_N 1
|
||||||
|
|
||||||
|
#ifdef ARCH_X86
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
#else
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
#define SGEMM_DEFAULT_UNROLL_M 16
|
||||||
|
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||||
|
#define QGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define CGEMM_DEFAULT_UNROLL_M 4
|
||||||
|
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||||
|
#define XGEMM_DEFAULT_UNROLL_M 1
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
|
#define CGEMM3M_DEFAULT_UNROLL_M 8
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_N 4
|
||||||
|
#define ZGEMM3M_DEFAULT_UNROLL_M 4
|
||||||
|
#define GEMV_UNROLL 8
|
||||||
|
#endif
|
||||||
|
|
||||||
|
|
||||||
|
#if defined(ARCH_X86_64)
|
||||||
|
#define SGEMM_DEFAULT_P 768
|
||||||
|
#define DGEMM_DEFAULT_P 384
|
||||||
|
#else
|
||||||
|
#define SGEMM_DEFAULT_P 448
|
||||||
|
#define DGEMM_DEFAULT_P 224
|
||||||
|
#endif
|
||||||
|
#define QGEMM_DEFAULT_P 112
|
||||||
|
#define CGEMM_DEFAULT_P 224
|
||||||
|
#define ZGEMM_DEFAULT_P 112
|
||||||
|
#define XGEMM_DEFAULT_P 56
|
||||||
|
|
||||||
|
#if defined(ARCH_X86_64)
|
||||||
|
#define SGEMM_DEFAULT_Q 168
|
||||||
|
#define DGEMM_DEFAULT_Q 168
|
||||||
|
#else
|
||||||
|
#define SGEMM_DEFAULT_Q 224
|
||||||
|
#define DGEMM_DEFAULT_Q 224
|
||||||
|
#endif
|
||||||
|
#define QGEMM_DEFAULT_Q 224
|
||||||
|
#define CGEMM_DEFAULT_Q 224
|
||||||
|
#define ZGEMM_DEFAULT_Q 224
|
||||||
|
#define XGEMM_DEFAULT_Q 224
|
||||||
|
|
||||||
|
#define SGEMM_DEFAULT_R sgemm_r
|
||||||
|
#define QGEMM_DEFAULT_R qgemm_r
|
||||||
|
#define DGEMM_DEFAULT_R dgemm_r
|
||||||
|
#define CGEMM_DEFAULT_R cgemm_r
|
||||||
|
#define ZGEMM_DEFAULT_R zgemm_r
|
||||||
|
#define XGEMM_DEFAULT_R xgemm_r
|
||||||
|
|
||||||
|
#define SYMV_P 16
|
||||||
|
#define HAVE_EXCLUSIVE_CACHE
|
||||||
|
|
||||||
|
#define GEMM_THREAD gemm_thread_mn
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#ifdef PILEDRIVER
|
||||||
|
|
||||||
|
#define SNUMOPT 8
|
||||||
|
#define DNUMOPT 4
|
||||||
|
|
||||||
|
#define GEMM_DEFAULT_OFFSET_A 64
|
||||||
|
#define GEMM_DEFAULT_OFFSET_B 832
|
||||||
|
#define GEMM_DEFAULT_ALIGN 0x0fffUL
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
#define QGEMM_DEFAULT_UNROLL_N 2
|
#define QGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||||
|
|
Loading…
Reference in New Issue