diff --git a/Makefile.system b/Makefile.system index 239047f36..eac61e961 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif diff --git a/TargetList.txt b/TargetList.txt index 1a212e6ca..c859db082 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -29,6 +29,7 @@ BARCELONA SHANGHAI ISTANBUL BOBCAT +BULLDOZER c)VIA CPU: SSE_GENERIC diff --git a/cpuid.h b/cpuid.h index bb57ad92d..c52d503cc 100644 --- a/cpuid.h +++ b/cpuid.h @@ -125,7 +125,8 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) -#define HAVE_AVX (1 << 18) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index a19dedeee..317774691 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -43,6 +43,8 @@ #ifdef NO_AVX #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM +#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA +#define CORE_BULLDOZER CORE_BARCELONA #endif #ifndef CPUIDEMU @@ -229,6 +231,9 @@ int get_cputype(int gettype){ cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; +#ifndef NO_AVX + if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; +#endif if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; } @@ -1078,8 +1083,12 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 10: - case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series return CPUTYPE_BARCELONA; + case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CPUTYPE_BULLDOZER; + else + return CPUTYPE_BARCELONA; //OS doesn't support AVX. case 5: return CPUTYPE_BOBCAT; } @@ -1432,8 +1441,13 @@ int get_coretype(void){ if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; - else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - else return CORE_BARCELONA; + else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CORE_BULLDOZER; + else + return CORE_BARCELONA; //OS doesn't support AVX. Use old kernels.
+ }else return CORE_BARCELONA; } } @@ -1519,6 +1533,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); @@ -1585,5 +1600,6 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index b6f27d0ad..6523abb4d 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA #endif @@ -204,6 +206,14 @@ static gotoblas_t *get_coretype(void){ else return &gotoblas_OPTERON; } else if (exfamily == 5) { return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } } else { return &gotoblas_BARCELONA; } @@ -240,6 +250,7 @@ static char *corename[] = { "Nano", "Sandybridge", "Bobcat", + "Bulldozer", }; char *gotoblas_corename(void) { @@ -261,6 +272,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; return corename[0]; } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d261e5a4e..58e5fb11d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) diff --git a/getarch.c b/getarch.c index 5916a9a04..2b9856338 100644 --- a/getarch.c +++ b/getarch.c @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "OPTERON" #endif -#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "BOBCAT" #endif +#if defined (FORCE_BULLDOZER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BULLDOZER" +#define ARCHCONFIG "-DBULLDOZER " \ + "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ + "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ + "-DHAVE_AVX -DHAVE_FMA4" +#define LIBNAME "bulldozer" +#define CORENAME "BULLDOZER" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4deabdacb..83f2b047f 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -810,6 +810,22 @@ static void init_parameter(void) { #endif #endif +#ifdef BULLDOZER + +#ifdef DEBUG + fprintf(stderr, "Bulldozer\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BULLDOZER @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + 
+CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S index 18b9a43bd..f081aec2a 100644 --- a/kernel/x86/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -596,7 +596,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -842,7 +842,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1168,7 +1168,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1198,7 +1198,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1347,7 +1347,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1531,7 +1531,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1778,7 +1778,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1793,7 +1793,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1924,7 +1924,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S index aa5ab760e..48edfc585 100644 --- a/kernel/x86/scal_sse.S +++ b/kernel/x86/scal_sse.S @@ -269,7 +269,7 @@ sarl $5, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index 67c1f437b..35b79132c 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -253,7 +253,7 @@ sarl $4, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 2b6877a31..036e17338 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 
#define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 82bb1d3ec..84da443a8 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: 
mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index d81177b7e..0bd924cba 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 854c44e7a..de7c04593 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: 
mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index f7a08c699..f5d5ad465 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if 
defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 80dc2451c..5c2dcd0d6 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, 
%xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S index 29158df25..623f0beec 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -74,7 +74,7 @@ #define BB %ecx #define LDC %ebp -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define movsd movlps #endif @@ -625,7 +625,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -870,7 +870,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1173,7 +1173,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1203,7 +1203,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1359,7 +1359,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1536,7 +1536,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1794,7 +1794,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1809,7 +1809,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1936,7 +1936,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 8e28bb8e6..0087ac6f4 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 607c51de0..f0f2dc0ec 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -58,7 +58,7 
@@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index fb98226ee..c7ad91235 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index e2f391a82..6c4842893 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index ee9eb9d25..d32451574 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 9ef572470..9f9449852 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index cd1bf2f53..dd0c5ab21 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER new file mode 100644 index 000000000..d59668519 --- /dev/null +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = 
zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S new file mode 100644 index 000000000..b06b07edf --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -0,0 +1,1860 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. 
IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + vfmaddpd %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm3,%xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4, %xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4, %xmm11 ;\ + vmovups 
-4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\ +/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ +/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ + vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ +/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ +/**/ vmovddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + 
vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\ + vmovapd %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO), %xmm1 ;\ + vfmaddpd %xmm9, %xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10, %xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1 ,%xmm14 ;\ + vfmaddpd %xmm11, %xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\ + vmovups -12 * SIZE(AO), %xmm0 ;\ + vmovddup -12 * SIZE(BO), %xmm1 ;\ + vmovddup -11 * SIZE(BO), %xmm3 ;\ + vmovapd %xmm0, %xmm2 + + +#define KERNEL_SUB2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -10 * SIZE(BO), %xmm1 ;\ + vmovddup -9 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups (AO), %xmm0 ;\ + vmovddup (BO), %xmm1 ;\ + vmovddup -7 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vmovups -4 * SIZE(AO), %xmm4 ;\ + vmovddup -4 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -3 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -2 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -1 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + ldc + 
+ leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + + .align 16 +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + vzeroall + prefetcht0 256(CO1) + prefetcht0 320(CO1) + prefetcht0 256(CO2) + prefetcht0 320(CO2) + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm3 + vmovups -8 * SIZE(AO), %xmm4 + vmovddup -8 * SIZE(BO), %xmm5 + + vmovaps %xmm0, %xmm2 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + + .align 16 +.L12: + prefetcht0 (AO,%rax,4) + prefetcht0 (BO,%rax,4) + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vmovaps %xmm2, %xmm0 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 + vfmaddpd %xmm14,%xmm2, 
%xmm1,%xmm14 + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + // prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12,%xmm12 + .align 2 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm7, %xmm13,%xmm13 + .align 2 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm14,%xmm14 + .align 2 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + vfmaddpd 2 * SIZE(CO2, LDC),%xmm7, %xmm15,%xmm15 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm13,%xmm13 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm14,%xmm14 + vmulpd %xmm7, %xmm11,%xmm11 + vmulpd %xmm7, %xmm15,%xmm15 + +#endif + + .align 2 + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + .align 2 + vmovups %xmm9, (CO1, LDC) + vmovups %xmm13, 2 * SIZE(CO1, LDC) + .align 2 + vmovups %xmm10, (CO2) + vmovups %xmm14, 2 * SIZE(CO2) + .align 2 + vmovups %xmm11, (CO2, LDC) + vmovups %xmm15, 2 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9 ,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10, %xmm10 + vmovddup -15 * SIZE(BO), %xmm5 + vxorps %xmm11, %xmm11, %xmm11 + vmovddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -9 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovddup (BO, %rax, 4), %xmm1 + vmovddup -7 * SIZE(BO, %rax, 4), %xmm5 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, 
%xmm5,%xmm9 + vmovddup -6 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -5 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -3 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -2 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -1 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovddup 8 * SIZE(BO, %rax, 4), %xmm3 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm11,%xmm11 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO1, LDC) + + vmovups %xmm10, (CO2) + vmovups %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9, %xmm9 + vmovddup -15 * SIZE(AO), %xmm4 + vxorps %xmm10, %xmm10,%xmm10 + vmovups -16 * SIZE(BO), %xmm1 + vxorps %xmm11, %xmm11,%xmm11 + vmovups -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + vfmaddpd %xmm10,%xmm4, %xmm1,%xmm10 + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 4), %xmm4,%xmm11 + vmovups (BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm4 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + 
vfmaddpd %xmm9,-6 * SIZE(BO, %rax, 4), %xmm2,%xmm9 + vmovups -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -13 * SIZE(AO, %rax, 1), %xmm2 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,-2 * SIZE(BO, %rax, 4), %xmm2,%xmm11 + vmovups 8 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO1, LDC), %xmm0,%xmm0 + vmovsd (CO2), %xmm1 + vmovhpd (CO2, LDC), %xmm1,%xmm1 + + + vfmaddpd %xmm0, %xmm7,%xmm8,%xmm8 + vfmaddpd %xmm1, %xmm7,%xmm9,%xmm9 +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO1, LDC) + vmovsd %xmm9, (CO2) + vmovhpd %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm5 + vmovddup -12 * SIZE(BO), %xmm3 + vxorps %xmm8, %xmm8,%xmm8 + vxorps %xmm9, %xmm9,%xmm9 + vxorps %xmm12, %xmm12,%xmm12 + vxorps %xmm13, %xmm13,%xmm13 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -8 * SIZE(AO), %xmm4 + // prefetcht0 256(CO1) + // prefetcht0 320(CO1) + // prefetcht0 256(CO2) + // prefetcht0 320(CO2) + // prefetchnta 24 * SIZE(CO1) + // prefetchnta 32 * SIZE(CO1) + // prefetchw 3 * SIZE(CO1) + vmovups %xmm0, %xmm2 + // prefetchw 3 * SIZE(CO2) + // prefetchnta -16 * SIZE(BB) + // prefetch -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, 
%rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovups -10 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vmovups (AO, %rax, 4), %xmm0 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm5 + vmovups -6 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm5 + vmovups -2 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovups 8 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -4 * SIZE(BO, %rax, 2), %xmm3 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + vfmaddpd (CO1),%xmm7, %xmm8, %xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12, %xmm12 + vfmaddpd (CO2),%xmm7, %xmm9, %xmm9 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm13, %xmm13 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm13,%xmm13 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + + vmovups %xmm9, (CO2) + vmovups %xmm13, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -15 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + 
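
Every micro-tile in this kernel repeats the TRMM bookkeeping seen here: plain GEMM runs the k loop over the full depth K, while the TRMM build runs either the trailing or the leading part of the triangular band. A sketch of the KKK value being computed (tile_k_count is a hypothetical helper; K, KK, mr, nr mirror the assembly symbols, with mr = nr = 2 at this point):

    /* Hypothetical model of the KK/KKK setup; not an OpenBLAS function. */
    static long tile_k_count(long K, long KK, int trmm, int left,
                             int transa, long mr, long nr)
    {
        if (!trmm)
            return K;                     /* plain GEMM: full depth        */
        if ((left && !transa) || (!left && transa))
            return K - KK;                /* trailing part of the band     */
        return KK + (left ? mr : nr);     /* leading part, grown per tile  */
    }

KK itself is advanced at the bottom of each tile (addq $2, KK below), so successive tiles see the correct depth.
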
movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm8,%xmm2, %xmm1,%xmm8 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm2, %xmm3,%xmm9 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm2, %xmm1,%xmm10 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm2, %xmm3,%xmm11 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO2),%xmm7, %xmm9,%xmm9 + +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + addq $2 * SIZE, CO2 # coffset += 2 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(AO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(BO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 2), %xmm3,%xmm11
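
The M=1 tails such as .L72 here keep four independent accumulators (xmm8 to xmm11), one per unrolled k step, so consecutive vfmaddpd results never feed one another and the FMA latency stays hidden; they are folded together only after the loop (.L78). A scalar sketch of the same idea, simplified to a single output (dot4 is an illustrative name):

    static double dot4(const double *a, const double *b, long k)
    {
        double s0 = 0.0, s1 = 0.0, s2 = 0.0, s3 = 0.0;
        long i = 0;
        for (; i + 4 <= k; i += 4) {   /* .L72: four independent chains */
            s0 += a[i + 0] * b[i + 0];
            s1 += a[i + 1] * b[i + 1];
            s2 += a[i + 2] * b[i + 2];
            s3 += a[i + 3] * b[i + 3];
        }
        for (; i < k; i++)             /* .L77: k & 3 remainder         */
            s0 += a[i] * b[i];
        return (s0 + s1) + (s2 + s3);  /* .L78: pairwise reduction      */
    }
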
+ vmovddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO2), %xmm0,%xmm0 +#endif + + vmulpd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddpd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -8 * SIZE(AO), %xmm2 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm12, %xmm12,%xmm12 + vmovddup -14 * SIZE(BO), %xmm3 + vxorps %xmm13, %xmm13,%xmm13 + vmovddup -15 * SIZE(BO), %xmm5 + + // prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovapd -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm1 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vfmaddpd %xmm13,-10 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd (AO, %rax, 4), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 1), %xmm5 + vfmaddpd %xmm8,%xmm3, %xmm2,%xmm8 + vfmaddpd %xmm12,-6 * SIZE(AO, %rax, 4), %xmm3,%xmm12 + vmovapd -4 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm3 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vfmaddpd %xmm13,-2 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd 8 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + vfmaddpd %xmm8,%xmm1, 
%xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm13, %xmm12,%xmm12 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7,%xmm12,%xmm12 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(BO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(BO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(AO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(AO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(AO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + vmovddup -16 * SIZE(BO, %rax, 1), %xmm0 + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 +#else + vmulpd %xmm7, %xmm8,%xmm8 + +#endif + + vmovups %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 2 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + 
movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -14 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 1), %xmm0,%xmm8 + vmovups -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 1), %xmm1,%xmm9 + vmovups -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + vmulsd -16 * SIZE(BO, %rax, 1), %xmm0,%xmm0 + vaddsd %xmm0, %xmm8,%xmm8 + vmovsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + vaddpd %xmm9, %xmm8,%xmm8 + vhaddpd %xmm8, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 +#endif + + vmulsd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddsd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), %rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S index b40c8bac7..becd19544 100644 --- a/kernel/x86_64/gemm_kernel_8x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -930,7 +930,7 @@ .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 @@ -983,7 +983,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 @@ -1178,7 +1178,7 @@ .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -1423,7 +1423,7 @@ .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -1765,7 +1765,7 @@ .L62: mulps %xmm8, %xmm9 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 @@ -1793,7 +1793,7 @@ addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -1822,7 +1822,7 @@ addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 @@ -1851,7 +1851,7 @@ addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 @@ -2024,7 +2024,7 @@ .L72: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2208,7 +2208,7 @@ .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -2395,7 +2395,7 @@ .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -2670,7 +2670,7 @@ .L112: mulps %xmm9, %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2687,7 +2687,7 @@ addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 @@ -2704,7 +2704,7 @@ addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 @@ -2721,7 +2721,7 @@ addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 @@ -2857,7 +2857,7 @@ .L122: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 @@ -2873,7 +2873,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif 
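
The rest of the patch is mechanical: every prefetch guard that lists the earlier AMD cores gains defined(BULLDOZER), so Bulldozer takes the AMD prefetch/prefetchw path rather than prefetcht0. A C-level picture of the two flavors, using the GCC builtin (illustrative only; the kernels emit the instructions directly):

    /* rw = 0 emits a plain read prefetch; rw = 1 emits prefetchw where
     * available, which also requests the line in a writable state. */
    static inline void prefetch_read(const void *p)
    {
        __builtin_prefetch(p, 0, 3);
    }
    static inline void prefetch_write(void *p)
    {
        __builtin_prefetch(p, 1, 3);
    }

The PREFETCHSIZE constants (for example 128 * 5 in the l1param.h hunk below) control how far ahead of the running pointer these touches land.
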
mulps %xmm10, %xmm11 @@ -3003,7 +3003,7 @@ .L132: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 @@ -3150,7 +3150,7 @@ .L142: mulss %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S index edde7e2c1..e5cbd62eb 100644 --- a/kernel/x86_64/gemm_ncopy_4_opteron.S +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (48 + 4) #define MOVNTQ MOVQ @@ -79,7 +79,7 @@ #define AO3 %r13 #define AO4 %rax -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S index 459eeb8c5..105fe3b47 100644 --- a/kernel/x86_64/gemm_tcopy_4_opteron.S +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ @@ -96,7 +96,7 @@ #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S index 4e66e5338..404608256 100644 --- a/kernel/x86_64/izamax_sse2.S +++ b/kernel/x86_64/izamax_sse2.S @@ -469,7 +469,7 @@ ALIGN_4 .L71: -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) prefetch PREFETCHSIZE * SIZE(X) #endif diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index 323e8b9dd..9c8dd9dc2 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -266,7 +266,7 @@ sarq $5, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index 8f5612081..3823b1fc9 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -251,7 +251,7 @@ sarq $4, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 5a123d7f6..0f1ebd564 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S 
index 8afdc87db..9dd123c52 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 5aef6b461..93a66aaa7 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index fa1bfba85..f412b3e2f 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S index 513572ee9..552dbacdc 100644 --- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S index 526a78c57..7727fd591 100644 --- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S index e96496fd6..699364941 100644 --- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S index bf318b7ff..8876b61ff 100644 --- a/kernel/x86_64/zgemm_ncopy_2.S +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -85,7 +85,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #endif diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 6af65a4ba..dcfe83189 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define MOVDDUP(a, b, 
c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 71aca0198..04605e3cb 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -167,7 +167,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 4b8422d82..e8b01ad7a 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define xt1 %xmm14 #define xt2 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 33667f79e..40246e52e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index b8caa9a44..79f20b641 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index 2db8cbc5d..f5c100ec1 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) 
#define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index 16c9ca828..18edeed57 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index dbdbfe2e1..f58cecdf5 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 181cdd29c..1b589e0cf 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index c28d02927..2c47ce3fd 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/l1param.h b/l1param.h index 6fe756f17..0b216c7c5 100644 --- a/l1param.h +++ b/l1param.h @@ -74,6 +74,13 @@ #define ALIGNED_ACCESS #endif +#ifdef BULLDOZER +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h index cdbd8805e..01fe7943d 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/param.h b/param.h index 11c1a269e..5b6a19ad5 100644 --- a/param.h +++ b/param.h @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define SNUMOPT 8 #define DNUMOPT 4
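
Closing the loop on the kernel itself: the two store paths selected by TRMMKERNEL (at .L19, .L29, .L59, .L99 and the smaller tails above) reduce, per element, to the following sketch. Here store_tile is an illustrative name, and the GEMM path accumulates into C because beta scaling is handled outside this kernel:

    static void store_tile(double *c, const double *acc, double alpha,
                           long n, int trmmkernel)
    {
        for (long i = 0; i < n; i++) {
            if (!trmmkernel)
                c[i] = alpha * acc[i] + c[i];  /* vfmaddpd mem, alpha, acc */
            else
                c[i] = alpha * acc[i];         /* vmulpd alpha, acc        */
        }
    }
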