From 7110d171468cf8fe11463c3a3bcd5cc4cef54868 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 28 Nov 2012 12:52:28 +0800 Subject: [PATCH 01/30] Added -lgomp for generating DLL on Windows. --- exports/Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/exports/Makefile b/exports/Makefile index 5219560ee..15041be86 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -22,6 +22,11 @@ ifeq ($(OSNAME), WINNT) ifeq ($(F_COMPILER), GFORTRAN) EXTRALIB += -lgfortran endif +ifeq ($(USE_OPENMP), 1) +ifeq ($(C_COMPILER), GCC) +EXTRALIB += -lgomp +endif +endif endif ifeq ($(OSNAME), CYGWIN_NT) From b7c0fa6bd223085f1b7ddade0bef487bd5c15688 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 6 Dec 2012 07:29:54 -0500 Subject: [PATCH 02/30] Init AMD Bulldozer codebase. --- Makefile.system | 4 +-- cpuid.h | 3 +- cpuid_x86.c | 20 +++++++++-- driver/others/dynamic.c | 12 +++++++ getarch.c | 18 +++++++++- kernel/x86/KERNEL.BULLDOZER | 59 ++++++++++++++++++++++++++++++++ kernel/x86_64/KERNEL.BULLDOZER | 62 ++++++++++++++++++++++++++++++++++ param.h | 2 +- 8 files changed, 172 insertions(+), 8 deletions(-) create mode 100644 kernel/x86/KERNEL.BULLDOZER create mode 100644 kernel/x86_64/KERNEL.BULLDOZER diff --git a/Makefile.system b/Makefile.system index 27f30fa61..75c0e0ad4 100644 --- a/Makefile.system +++ b/Makefile.system @@ -277,14 +277,14 @@ ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif ifeq ($(ARCH), x86_64) DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO ifneq ($(NO_AVX), 1) -DYNAMIC_CORE += SANDYBRIDGE +DYNAMIC_CORE += SANDYBRIDGE BULLDOZER endif endif diff --git a/cpuid.h b/cpuid.h index bb57ad92d..c52d503cc 100644 --- a/cpuid.h +++ b/cpuid.h @@ -125,7 +125,8 @@ #define 
HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) -#define HAVE_AVX (1 << 18) +#define HAVE_AVX (1 << 18) +#define HAVE_FMA4 (1 << 19) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 diff --git a/cpuid_x86.c b/cpuid_x86.c index 6e4eae20d..afc3b17b7 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -43,6 +43,8 @@ #ifdef NO_AVX #define CPUTYPE_SANDYBRIDGE CPUTYPE_NEHALEM #define CORE_SANDYBRIDGE CORE_NEHALEM +#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA +#define CORE_BULLDOZER CORE_BARCELONA #endif #ifndef CPUIDEMU @@ -228,6 +230,9 @@ int get_cputype(int gettype){ cpuid(0x80000001, &eax, &ebx, &ecx, &edx); if ((ecx & (1 << 6)) != 0) feature |= HAVE_SSE4A; if ((ecx & (1 << 7)) != 0) feature |= HAVE_MISALIGNSSE; +#ifndef NO_AVX + if ((ecx & (1 << 16)) != 0) feature |= HAVE_FMA4; +#endif if ((edx & (1 << 30)) != 0) feature |= HAVE_3DNOWEX; if ((edx & (1 << 31)) != 0) feature |= HAVE_3DNOW; } @@ -1075,8 +1080,12 @@ int get_cpuname(void){ return CPUTYPE_OPTERON; case 1: case 10: - case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series return CPUTYPE_BARCELONA; + case 6: //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CPUTYPE_BULLDOZER; + else + return CPUTYPE_BARCELONA; //OS don't support AVX. case 5: return CPUTYPE_BOBCAT; } @@ -1427,8 +1436,13 @@ int get_coretype(void){ if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else if (exfamily == 5) return CORE_BOBCAT; - else if (exfamily == 6) return CORE_BARCELONA; //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series - else return CORE_BARCELONA; + else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return CORE_BULLDOZER; + else + return CORE_BARCELONA; //OS don't support AVX. Use old kernels. 
+ }else return CORE_BARCELONA; } } diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 5d2bc782f..1c0e1d3bb 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -63,9 +63,11 @@ extern gotoblas_t gotoblas_BARCELONA; extern gotoblas_t gotoblas_BOBCAT; #ifndef NO_AVX extern gotoblas_t gotoblas_SANDYBRIDGE; +extern gotoblas_t gotoblas_BULLDOZER; #else //Use NEHALEM kernels for sandy bridge #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM +#define gotoblas_BULLDOZER gotoblas_BARCELONA #endif @@ -202,6 +204,14 @@ static gotoblas_t *get_coretype(void){ else return &gotoblas_OPTERON; } else if (exfamily == 5) { return &gotoblas_BOBCAT; + } else if (exfamily == 6) { + //AMD Bulldozer Opteron 6200 / Opteron 4200 / AMD FX-Series + if(support_avx()) + return &gotoblas_BULLDOZER; + else{ + fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); + return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. + } } else { return &gotoblas_BARCELONA; } @@ -238,6 +248,7 @@ static char *corename[] = { "Nano", "Sandybridge", "Bobcat", + "Bulldozer", }; char *gotoblas_corename(void) { @@ -259,6 +270,7 @@ char *gotoblas_corename(void) { if (gotoblas == &gotoblas_NANO) return corename[15]; if (gotoblas == &gotoblas_SANDYBRIDGE) return corename[16]; if (gotoblas == &gotoblas_BOBCAT) return corename[17]; + if (gotoblas == &gotoblas_BULLDOZER) return corename[18]; return corename[0]; } diff --git a/getarch.c b/getarch.c index 5916a9a04..4daf260f0 100644 --- a/getarch.c +++ b/getarch.c @@ -350,7 +350,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "OPTERON" #endif -#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_BULLDOZER) +#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" @@ -380,6 +380,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "BOBCAT" #endif +#if defined (FORCE_BULLDOZER) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BULLDOZER" +#define ARCHCONFIG "-DBARCELONA " \ + "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ + "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \ + "-DHAVE_AVX -DHAVE_FMA4" +#define LIBNAME "bulldozer" +#define CORENAME "BULLDOZER" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/x86/KERNEL.BULLDOZER b/kernel/x86/KERNEL.BULLDOZER new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BULLDOZER @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = 
+CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER new file mode 100644 index 000000000..051a52286 --- /dev/null +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = 
sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/param.h b/param.h index 11c1a269e..5b6a19ad5 100644 --- a/param.h 
+++ b/param.h @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define SNUMOPT 8 #define DNUMOPT 4 From bfaaa975e6789acbce20384d01bd34b122832d18 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 7 Dec 2012 00:53:31 +0800 Subject: [PATCH 03/30] Added BULLDOZER target. So far it uses barcelona kernels. --- TargetList.txt | 1 + driver/others/parameter.c | 2 +- getarch.c | 4 +-- kernel/setparam-ref.c | 16 ++++++++++ kernel/x86/gemm_kernel_4x4_barcelona.S | 20 ++++++------ kernel/x86/scal_sse.S | 2 +- kernel/x86/scal_sse2.S | 2 +- kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 ++++++------- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 ++++++------- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 +++--- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 ++++++------- kernel/x86/zgemm3m_kernel_4x4_barcelona.S | 22 ++++++------- kernel/x86/zgemv_n_sse.S | 2 +- kernel/x86/zgemv_n_sse2.S | 2 +- kernel/x86/zgemv_t_sse.S | 2 +- kernel/x86/zgemv_t_sse2.S | 2 +- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 +-- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 +-- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 +-- kernel/x86_64/gemm_kernel_8x4_barcelona.S | 38 +++++++++++------------ kernel/x86_64/gemm_ncopy_4_opteron.S | 4 +-- kernel/x86_64/gemm_tcopy_4_opteron.S | 4 +-- kernel/x86_64/izamax_sse2.S | 2 +- kernel/x86_64/scal_sse.S | 2 +- kernel/x86_64/scal_sse2.S | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/trsm_kernel_LN_8x4_sse.S | 2 +- kernel/x86_64/trsm_kernel_LT_8x4_sse.S | 2 +- kernel/x86_64/trsm_kernel_RT_8x4_sse.S | 2 +- kernel/x86_64/zgemm_ncopy_2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- 
kernel/x86_64/zsymv_L_sse2.S | 4 +-- kernel/x86_64/zsymv_U_sse.S | 4 +-- kernel/x86_64/zsymv_U_sse2.S | 4 +-- kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 2 +- l1param.h | 7 +++++ l2param.h | 2 +- 47 files changed, 156 insertions(+), 132 deletions(-) diff --git a/TargetList.txt b/TargetList.txt index 1a212e6ca..c859db082 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -29,6 +29,7 @@ BARCELONA SHANGHAI ISTANBUL BOBCAT +BULLDOZER c)VIA CPU: SSE_GENERIC diff --git a/driver/others/parameter.c b/driver/others/parameter.c index d261e5a4e..58e5fb11d 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) diff --git a/getarch.c b/getarch.c index 4daf260f0..2b9856338 100644 --- a/getarch.c +++ b/getarch.c @@ -385,12 +385,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define FORCE_INTEL #define ARCHITECTURE "X86" #define SUBARCHITECTURE "BULLDOZER" -#define ARCHCONFIG "-DBARCELONA " \ +#define ARCHCONFIG "-DBULLDOZER " \ "-DL1_DATA_SIZE=49152 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=1024000 -DL2_LINESIZE=64 -DL3_SIZE=16777216 " \ "-DDTB_DEFAULT_ENTRIES=32 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 " \ - "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU" \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_128BITFPU -DHAVE_FASTMOVU " \ "-DHAVE_AVX -DHAVE_FMA4" #define LIBNAME "bulldozer" #define CORENAME "BULLDOZER" diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index f57b425e6..e8db76871 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -810,6 +810,22 @@ static void init_parameter(void) { #endif #endif +#ifdef BULLDOZER + +#ifdef DEBUG + fprintf(stderr, "Bulldozer\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/gemm_kernel_4x4_barcelona.S b/kernel/x86/gemm_kernel_4x4_barcelona.S index 18b9a43bd..f081aec2a 100644 --- a/kernel/x86/gemm_kernel_4x4_barcelona.S +++ b/kernel/x86/gemm_kernel_4x4_barcelona.S @@ -596,7 +596,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -842,7 +842,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1168,7 +1168,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if 
defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1198,7 +1198,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1347,7 +1347,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1531,7 +1531,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1778,7 +1778,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1793,7 +1793,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1924,7 +1924,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/scal_sse.S b/kernel/x86/scal_sse.S index aa5ab760e..48edfc585 100644 --- a/kernel/x86/scal_sse.S +++ b/kernel/x86/scal_sse.S @@ -269,7 +269,7 @@ sarl $5, I jle .L113 -#if defined(BARCELONA) +#if 
defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86/scal_sse2.S b/kernel/x86/scal_sse2.S index 67c1f437b..35b79132c 100644 --- a/kernel/x86/scal_sse2.S +++ b/kernel/x86/scal_sse2.S @@ -253,7 +253,7 @@ sarl $4, I jle .L113 -#if defined(BARCELONA) +#if defined(BARCELONA) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 2b6877a31..036e17338 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) 
* SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 82bb1d3ec..84da443a8 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index d81177b7e..0bd924cba 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || 
defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 854c44e7a..de7c04593 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * 
SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index f7a08c699..f5d5ad465 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 80dc2451c..5c2dcd0d6 100644 --- 
a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 
(PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S index 29158df25..623f0beec 100644 --- a/kernel/x86/zgemm3m_kernel_4x4_barcelona.S +++ b/kernel/x86/zgemm3m_kernel_4x4_barcelona.S @@ -74,7 +74,7 @@ #define BB %ecx #define LDC %ebp -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define movsd movlps #endif @@ -625,7 +625,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 4 * SIZE(BB), %xmm2 @@ -870,7 +870,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE 
+ 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1173,7 +1173,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1203,7 +1203,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1359,7 +1359,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -1536,7 +1536,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -1794,7 +1794,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -1809,7 +1809,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -1936,7 +1936,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -2069,7 +2069,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 
8e28bb8e6..0087ac6f4 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 607c51de0..f0f2dc0ec 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index fb98226ee..c7ad91235 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -71,7 +71,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 5) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index e2f391a82..6c4842893 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -58,7 +58,7 @@ #define movsd movlps #endif -#ifdef BARCELONA +#if defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetchnta #define PREFETCHW prefetchw #define PREFETCHSIZE (8 * 5) diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index ee9eb9d25..d32451574 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 9ef572470..9f9449852 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index cd1bf2f53..dd0c5ab21 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || defined(BULLDOZER) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/gemm_kernel_8x4_barcelona.S b/kernel/x86_64/gemm_kernel_8x4_barcelona.S index b40c8bac7..becd19544 100644 --- 
a/kernel/x86_64/gemm_kernel_8x4_barcelona.S +++ b/kernel/x86_64/gemm_kernel_8x4_barcelona.S @@ -930,7 +930,7 @@ .L22: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps 4 * SIZE(BO), %xmm9 @@ -983,7 +983,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm9 @@ -1178,7 +1178,7 @@ .L32: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -1423,7 +1423,7 @@ .L42: mulss %xmm8, %xmm9 addss %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -1765,7 +1765,7 @@ .L62: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif mulps 4 * SIZE(BO), %xmm8 @@ -1793,7 +1793,7 @@ addps %xmm8, %xmm5 movaps 32 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -1822,7 +1822,7 @@ addps %xmm10, %xmm5 movaps 48 * SIZE(AO), %xmm10 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || 
defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm12, %xmm13 @@ -1851,7 +1851,7 @@ addps %xmm12, %xmm5 movaps 64 * SIZE(AO), %xmm12 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm14, %xmm15 @@ -2024,7 +2024,7 @@ .L72: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2208,7 +2208,7 @@ .L82: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd 4 * SIZE(BO), %xmm9 @@ -2395,7 +2395,7 @@ .L92: mulps %xmm8, %xmm9 addps %xmm9, %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss 4 * SIZE(BO), %xmm9 @@ -2670,7 +2670,7 @@ .L112: mulps %xmm9, %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif @@ -2687,7 +2687,7 @@ addps %xmm9, %xmm4 movaps 8 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm9, %xmm10 @@ -2704,7 +2704,7 @@ addps %xmm9, %xmm4 movaps 32 * SIZE(BO), %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH 
(PREFETCHSIZE + 32) * SIZE(AO) #endif mulps %xmm11, %xmm12 @@ -2721,7 +2721,7 @@ addps %xmm11, %xmm4 movaps 24 * SIZE(BO), %xmm11 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 48) * SIZE(AO) #endif mulps %xmm11, %xmm14 @@ -2857,7 +2857,7 @@ .L122: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movaps -28 * SIZE(AO), %xmm8 @@ -2873,7 +2873,7 @@ addps %xmm8, %xmm3 movaps 0 * SIZE(AO), %xmm8 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 16) * SIZE(AO) #endif mulps %xmm10, %xmm11 @@ -3003,7 +3003,7 @@ .L132: mulps %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movsd -30 * SIZE(AO), %xmm8 @@ -3150,7 +3150,7 @@ .L142: mulss %xmm8, %xmm9 -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) PREFETCH (PREFETCHSIZE + 0) * SIZE(AO) #endif movss -31 * SIZE(AO), %xmm8 diff --git a/kernel/x86_64/gemm_ncopy_4_opteron.S b/kernel/x86_64/gemm_ncopy_4_opteron.S index edde7e2c1..e5cbd62eb 100644 --- a/kernel/x86_64/gemm_ncopy_4_opteron.S +++ b/kernel/x86_64/gemm_ncopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (48 + 4) #define MOVNTQ MOVQ @@ -79,7 +79,7 @@ #define AO3 %r13 #define AO4 %rax -#if defined(BARCELONA) || 
defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/gemm_tcopy_4_opteron.S b/kernel/x86_64/gemm_tcopy_4_opteron.S index 459eeb8c5..105fe3b47 100644 --- a/kernel/x86_64/gemm_tcopy_4_opteron.S +++ b/kernel/x86_64/gemm_tcopy_4_opteron.S @@ -39,7 +39,7 @@ #define ASSEMBLER #include "common.h" -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE (12 + 4) #define WPREFETCHSIZE (12 + 4) #define MOVNTQ MOVQ @@ -96,7 +96,7 @@ #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCH prefetch #else #define RPREFETCH prefetch diff --git a/kernel/x86_64/izamax_sse2.S b/kernel/x86_64/izamax_sse2.S index 4e66e5338..404608256 100644 --- a/kernel/x86_64/izamax_sse2.S +++ b/kernel/x86_64/izamax_sse2.S @@ -469,7 +469,7 @@ ALIGN_4 .L71: -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) prefetch PREFETCHSIZE * SIZE(X) #endif diff --git a/kernel/x86_64/scal_sse.S b/kernel/x86_64/scal_sse.S index 323e8b9dd..9c8dd9dc2 100644 --- a/kernel/x86_64/scal_sse.S +++ b/kernel/x86_64/scal_sse.S @@ -266,7 +266,7 @@ sarq $5, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulps -32 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/scal_sse2.S b/kernel/x86_64/scal_sse2.S index 8f5612081..3823b1fc9 100644 --- a/kernel/x86_64/scal_sse2.S +++ b/kernel/x86_64/scal_sse2.S @@ -251,7 +251,7 @@ sarq $4, I jle .L113 -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) movaps %xmm0, %xmm1 mulpd -16 * SIZE(X), %xmm1 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 
5a123d7f6..0f1ebd564 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index 8afdc87db..9dd123c52 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 5aef6b461..93a66aaa7 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index fa1bfba85..f412b3e2f 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S index 513572ee9..552dbacdc 100644 --- a/kernel/x86_64/trsm_kernel_LN_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LN_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 
#endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S index 526a78c57..7727fd591 100644 --- a/kernel/x86_64/trsm_kernel_LT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_LT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S index e96496fd6..699364941 100644 --- a/kernel/x86_64/trsm_kernel_RT_8x4_sse.S +++ b/kernel/x86_64/trsm_kernel_RT_8x4_sse.S @@ -86,7 +86,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define movsd movlps diff --git a/kernel/x86_64/zgemm_ncopy_2.S b/kernel/x86_64/zgemm_ncopy_2.S index bf318b7ff..8876b61ff 100644 --- a/kernel/x86_64/zgemm_ncopy_2.S +++ b/kernel/x86_64/zgemm_ncopy_2.S @@ -85,7 +85,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define RPREFETCHSIZE 32 #define WPREFETCHSIZE 48 #endif diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 6af65a4ba..dcfe83189 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define MOVDDUP(a, 
b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 71aca0198..04605e3cb 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -167,7 +167,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 4b8422d82..e8b01ad7a 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define xt1 %xmm14 #define xt2 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 33667f79e..40246e52e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if 
defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) @@ -166,7 +166,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BULLDOZER) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index b8caa9a44..79f20b641 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index 2db8cbc5d..f5c100ec1 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index 16c9ca828..18edeed57 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || 
defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index dbdbfe2e1..f58cecdf5 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 181cdd29c..1b589e0cf 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index c28d02927..2c47ce3fd 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/l1param.h b/l1param.h index 6fe756f17..0b216c7c5 100644 --- a/l1param.h +++ 
b/l1param.h @@ -74,6 +74,13 @@ #define ALIGNED_ACCESS #endif +#ifdef BULLDOZER +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h index cdbd8805e..01fe7943d 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) || defined(BULLDOZER) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps From f19af5ecc05080cd8de729490347e796b6a7af89 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 7 Dec 2012 00:58:03 +0800 Subject: [PATCH 04/30] Refs #54. Added AMD Bulldozer x86_64 dgemm kernel developed by Werner Saar Based on the dgemm kernel for AMD Barcelona, he used AVX and FMA4 instructions. Thank Werner Saar! --- kernel/x86_64/KERNEL.BULLDOZER | 2 +- kernel/x86_64/dgemm_kernel_4x4_bulldozer.S | 1860 ++++++++++++++++++++ 2 files changed, 1861 insertions(+), 1 deletion(-) create mode 100644 kernel/x86_64/dgemm_kernel_4x4_bulldozer.S diff --git a/kernel/x86_64/KERNEL.BULLDOZER b/kernel/x86_64/KERNEL.BULLDOZER index 051a52286..d59668519 100644 --- a/kernel/x86_64/KERNEL.BULLDOZER +++ b/kernel/x86_64/KERNEL.BULLDOZER @@ -10,7 +10,7 @@ SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) -DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMKERNEL = dgemm_kernel_4x4_bulldozer.S DGEMMINCOPY = DGEMMITCOPY = DGEMMONCOPY = gemm_ncopy_4_opteron.S diff --git a/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S new file mode 100644 index 000000000..b06b07edf --- /dev/null +++ b/kernel/x86_64/dgemm_kernel_4x4_bulldozer.S @@ -0,0 +1,1860 @@ 
+/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#define ASSEMBLER +#include "common.h" + +#define OLD_M %rdi +#define OLD_N %rsi +#define M %r13 +#define N %r14 +#define K %rdx + +#define A %rcx +#define B %r8 +#define C %r9 +#define LDC %r10 + +#define I %r11 +#define AO %rdi +#define BO %rsi +#define CO1 %r15 +#define CO2 %r12 +#define BB %rbp +#define J %rbx + +#ifndef WINDOWS_ABI + +#define STACKSIZE 96 + +#define ALPHA 48(%rsp) +#define OFFSET 56(%rsp) +#define KK 64(%rsp) +#define KKK 72(%rsp) + +#else + +#define STACKSIZE 256 + +#define OLD_A 40 + STACKSIZE(%rsp) +#define OLD_B 48 + STACKSIZE(%rsp) +#define OLD_C 56 + STACKSIZE(%rsp) +#define OLD_LDC 64 + STACKSIZE(%rsp) +#define OLD_OFFSET 72 + STACKSIZE(%rsp) + +#define ALPHA 224(%rsp) +#define OFFSET 232(%rsp) +#define KK 240(%rsp) +#define KKK 248(%rsp) + +#endif + +#define movapd movaps +#define movupd movups + +#define KERNEL1(xx) \ + vfmaddpd %xmm8,%xmm1,%xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm9,%xmm3,%xmm0,%xmm9 ;\ + vfmaddpd %xmm12,%xmm2,%xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2,%xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1,%xmm0,%xmm10 ;\ + vfmaddpd %xmm11,%xmm3,%xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 ;\ + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + +#define KERNEL2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups (AO, %rax, 4), %xmm6 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + 
vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup (BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup -7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4, %xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4, %xmm11 ;\ + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 ;\ + vmovddup -4 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup -3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5 ,%xmm12;\ +/*A*/ vmovups 8 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup -2 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ +/**/ vmovddup 8 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL5(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 2 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vmovddup 2 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 3 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, 
%xmm6,%xmm11 ;\ + vmovups 4 * SIZE(AO, %rax, 4), %xmm6 ;\ + vmovddup 4 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 5 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm6, %xmm2 + +#define KERNEL6(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm6,%xmm8 ;\ + vmovaps %xmm2, %xmm6 ;\ + vmovups 6 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ +/*A*/ vmovups 16 * SIZE(AO, %rax, 4), %xmm0 ;\ + vmovddup 6 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm9,%xmm3, %xmm6,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 7 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm6,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ +/**/ vmovddup 16 * SIZE(BO, %rax, 4), %xmm1 ;\ + vfmaddpd %xmm11,%xmm3, %xmm6,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2,%xmm3,%xmm15 ;\ + vmovddup 9 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL7(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 10 * SIZE(AO, %rax, 4),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vmovddup 10 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup 11 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vmovups 12 * SIZE(AO, %rax, 4), %xmm7 ;\ + vmovddup 12 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 13 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm7, %xmm2 + +#define KERNEL8(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm7,%xmm8 ;\ + vmovaps %xmm2, %xmm7 ;\ + vmovups 14 * SIZE(AO, %rax, 4),%xmm2 ;\ +/*A*/ vmovups 24 * SIZE(AO, %rax, 4), %xmm4 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm7,%xmm9 ;\ + vmovddup 14 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup 15 * SIZE(BO, %rax, 4), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm7,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 
;\ +/**/ vmovddup 24 * SIZE(BO, %rax, 4), %xmm5 ;\ + vfmaddpd %xmm11,%xmm3, %xmm7,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 17 * SIZE(BO, %rax, 4), %xmm3 ;\ + vmovaps %xmm0, %xmm2 ;\ + addq $8 * SIZE, %rax ;\ + +#define KERNEL_SUB1(xx) \ + vfmaddpd %xmm8, %xmm1, %xmm0,%xmm8 ;\ + vmovapd %xmm2, %xmm0 ;\ + vmovups -14 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12, %xmm2, %xmm1,%xmm12 ;\ + vmovddup -14 * SIZE(BO), %xmm1 ;\ + vfmaddpd %xmm9, %xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13, %xmm2, %xmm3,%xmm13 ;\ + vmovddup -13 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10, %xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1 ,%xmm14 ;\ + vfmaddpd %xmm11, %xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15, %xmm2, %xmm3,%xmm15 ;\ + vmovups -12 * SIZE(AO), %xmm0 ;\ + vmovddup -12 * SIZE(BO), %xmm1 ;\ + vmovddup -11 * SIZE(BO), %xmm3 ;\ + vmovapd %xmm0, %xmm2 + + +#define KERNEL_SUB2(xx) \ + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 ;\ + vmovaps %xmm2, %xmm0 ;\ + vmovups -10 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -10 * SIZE(BO), %xmm1 ;\ + vmovddup -9 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovups (AO), %xmm0 ;\ + vmovddup (BO), %xmm1 ;\ + vmovddup -7 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB3(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -6 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -6 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -5 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vmovups -4 * SIZE(AO), %xmm4 ;\ + vmovddup -4 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + 
vmovddup -3 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm4, %xmm2 + +#define KERNEL_SUB4(xx) \ + vfmaddpd %xmm8,%xmm5, %xmm4,%xmm8 ;\ + vmovaps %xmm2, %xmm4 ;\ + vmovups -2 * SIZE(AO),%xmm2 ;\ + vfmaddpd %xmm12,%xmm2, %xmm5,%xmm12 ;\ + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 ;\ + vmovddup -2 * SIZE(BO), %xmm5 ;\ + vfmaddpd %xmm9,%xmm3, %xmm4,%xmm9 ;\ + vmovddup -1 * SIZE(BO), %xmm3 ;\ + vfmaddpd %xmm10,%xmm5, %xmm4,%xmm10 ;\ + vfmaddpd %xmm14,%xmm2, %xmm5,%xmm14 ;\ + vfmaddpd %xmm11,%xmm3, %xmm4,%xmm11 ;\ + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 ;\ + vmovddup 1 * SIZE(BO), %xmm3 ;\ + vmovaps %xmm0, %xmm2 + + PROLOGUE + PROFCODE + + subq $STACKSIZE, %rsp + movq %rbx, (%rsp) + movq %rbp, 8(%rsp) + movq %r12, 16(%rsp) + movq %r13, 24(%rsp) + movq %r14, 32(%rsp) + movq %r15, 40(%rsp) + + vzeroupper + +#ifdef WINDOWS_ABI + movq %rdi, 48(%rsp) + movq %rsi, 56(%rsp) + movups %xmm6, 64(%rsp) + movups %xmm7, 80(%rsp) + movups %xmm8, 96(%rsp) + movups %xmm9, 112(%rsp) + movups %xmm10, 128(%rsp) + movups %xmm11, 144(%rsp) + movups %xmm12, 160(%rsp) + movups %xmm13, 176(%rsp) + movups %xmm14, 192(%rsp) + movups %xmm15, 208(%rsp) + + movq ARG1, OLD_M + movq ARG2, OLD_N + movq ARG3, K + movq OLD_A, A + movq OLD_B, B + movq OLD_C, C + movq OLD_LDC, LDC +#ifdef TRMMKERNEL + movsd OLD_OFFSET, %xmm12 +#endif + vmovaps %xmm3, %xmm0 + +#else + movq STACKSIZE + 8(%rsp), LDC +#ifdef TRMMKERNEL + movsd STACKSIZE + 16(%rsp), %xmm12 +#endif + +#endif + + movq OLD_M, M + movq OLD_N, N + + subq $-16 * SIZE, A + subq $-16 * SIZE, B + + vmovsd %xmm0, ALPHA + + salq $BASE_SHIFT, LDC + +#ifdef TRMMKERNEL + vmovsd %xmm12, OFFSET + vmovsd %xmm12, KK +#ifndef LEFT + negq KK +#endif +#endif + movq N, J + sarq $2, J # j = (n >> 2) + jle .L40 + ALIGN_4 + +.L01: + movq C, CO1 # coffset1 = c + leaq (C, LDC, 2), CO2 # coffset2 = c + ldc + + leaq (C, LDC, 4), C # c += 4 * ldc + +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq A, AO # aoffset = a + + movq K, %rax + salq 
$BASE_SHIFT + 2, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L20 + ALIGN_4 + + .align 16 +.L11: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 4), BO +#endif + + vzeroall + prefetcht0 256(CO1) + prefetcht0 320(CO1) + prefetcht0 256(CO2) + prefetcht0 320(CO2) + vmovups -16 * SIZE(AO), %xmm0 + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm3 + vmovups -8 * SIZE(AO), %xmm4 + vmovddup -8 * SIZE(BO), %xmm5 + + vmovaps %xmm0, %xmm2 + + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-8, %rax + salq $BASE_SHIFT, %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L15 + ALIGN_4 + + .align 16 +.L12: + prefetcht0 (AO,%rax,4) + prefetcht0 (BO,%rax,4) + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je 
.L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + NOBRANCH + je .L15 + KERNEL1(16 * 0) + KERNEL2(16 * 0) + KERNEL3(16 * 0) + KERNEL4(16 * 0) + KERNEL5(16 * 0) + KERNEL6(16 * 0) + KERNEL7(16 * 0) + KERNEL8(16 * 0) + jl .L12 + ALIGN_4 + +.L15: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + testq $4, %rax + je .L16 + ALIGN_4 + + KERNEL_SUB1(16 * 0) + KERNEL_SUB2(16 * 0) + KERNEL_SUB3(16 * 0) + KERNEL_SUB4(16 * 0) + + subq $-16 * SIZE, BO + subq $-16 * SIZE, AO + ALIGN_4 + +.L16: +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L19 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L17: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vmovaps %xmm2, %xmm0 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm3, %xmm0,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm3,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm3 + vfmaddpd %xmm10,%xmm1, %xmm0,%xmm10 + vfmaddpd %xmm14,%xmm2, %xmm1,%xmm14 + vfmaddpd %xmm11,%xmm3, %xmm0,%xmm11 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm15,%xmm2, %xmm3,%xmm15 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm3 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L17 + ALIGN_4 + +.L19: + // prefetch -8 * SIZE(BB) + subq $-16 * SIZE, BB + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12,%xmm12 + .align 2 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd 2 * SIZE(CO1, LDC),%xmm7, %xmm13,%xmm13 + .align 2 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd 2 * SIZE(CO2),%xmm7, 
%xmm14,%xmm14 + .align 2 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + vfmaddpd 2 * SIZE(CO2, LDC),%xmm7, %xmm15,%xmm15 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm13,%xmm13 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm14,%xmm14 + vmulpd %xmm7, %xmm11,%xmm11 + vmulpd %xmm7, %xmm15,%xmm15 + +#endif + + .align 2 + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + .align 2 + vmovups %xmm9, (CO1, LDC) + vmovups %xmm13, 2 * SIZE(CO1, LDC) + .align 2 + vmovups %xmm10, (CO2) + vmovups %xmm14, 2 * SIZE(CO2) + .align 2 + vmovups %xmm11, (CO2, LDC) + vmovups %xmm15, 2 * SIZE(CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + BRANCH + jg .L11 + ALIGN_4 + +.L20: + testq $3, M + je .L39 + + testq $2, M + je .L30 + ALIGN_4 + +.L21: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 4), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9 ,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10, %xmm10 + vmovddup -15 * SIZE(BO), %xmm5 + vxorps %xmm11, %xmm11, %xmm11 + vmovddup -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $4, %rax +#endif + movq 
%rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L26 + ALIGN_4 + +.L22: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + vmovddup -10 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -9 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovddup (BO, %rax, 4), %xmm1 + vmovddup -7 * SIZE(BO, %rax, 4), %xmm5 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -6 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -5 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -3 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,%xmm2, %xmm5,%xmm9 + vmovddup -2 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -1 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,%xmm2, %xmm5,%xmm11 + vmovddup 8 * SIZE(BO, %rax, 4), %xmm3 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup 1 * SIZE(BO, %rax, 4), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L22 + ALIGN_4 + +.L26: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L29 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L27: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm5,%xmm9 + 
vmovddup -13 * SIZE(BO, %rax, 4), %xmm5 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + vmovddup -12 * SIZE(BO, %rax, 4), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm5,%xmm11 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 4), %xmm5 + + addq $SIZE, %rax + jl .L27 + ALIGN_4 + +.L29: +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO1, LDC),%xmm7, %xmm9,%xmm9 + vfmaddpd (CO2),%xmm7, %xmm10,%xmm10 + vfmaddpd (CO2, LDC),%xmm7, %xmm11,%xmm11 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm10,%xmm10 + vmulpd %xmm7, %xmm11,%xmm11 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO1, LDC) + + vmovups %xmm10, (CO2) + vmovups %xmm11, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 + addq $2 * SIZE, CO2 + ALIGN_4 + +.L30: + testq $1, M + je .L39 + +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 4), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8, %xmm8 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9, %xmm9 + vmovddup -15 * SIZE(AO), %xmm4 + vxorps %xmm10, %xmm10,%xmm10 + vmovups -16 * SIZE(BO), %xmm1 + vxorps %xmm11, %xmm11,%xmm11 + vmovups -8 * SIZE(BO), %xmm3 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $4, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), 
%rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + NOBRANCH + je .L36 + ALIGN_4 + +.L32: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + vfmaddpd %xmm10,%xmm4, %xmm1,%xmm10 + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 4), %xmm4,%xmm11 + vmovups (BO, %rax, 4), %xmm1 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm4 + vfmaddpd %xmm8,%xmm2, %xmm3,%xmm8 + vfmaddpd %xmm9,-6 * SIZE(BO, %rax, 4), %xmm2,%xmm9 + vmovups -4 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -13 * SIZE(AO, %rax, 1), %xmm2 + vfmaddpd %xmm10,%xmm2, %xmm3,%xmm10 + vfmaddpd %xmm11,-2 * SIZE(BO, %rax, 4), %xmm2,%xmm11 + vmovups 8 * SIZE(BO, %rax, 4), %xmm3 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L32 + ALIGN_4 + +.L36: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L38 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 4), BO + negq %rax + ALIGN_4 + +.L37: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 4), %xmm0,%xmm9 + vmovups -12 * SIZE(BO, %rax, 4), %xmm1 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L37 + ALIGN_4 + +.L38: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO1, LDC), %xmm0,%xmm0 + vmovsd (CO2), %xmm1 + vmovhpd (CO2, LDC), %xmm1,%xmm1 + + + vfmaddpd %xmm0, %xmm7,%xmm8,%xmm8 + vfmaddpd %xmm1, %xmm7,%xmm9,%xmm9 +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO1, LDC) + vmovsd %xmm9, (CO2) + vmovhpd %xmm9, (CO2, LDC) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 
4), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L39: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $4, KK +#endif + + movq BO, B + + decq J # j -- + jg .L01 + ALIGN_4 + +.L40: + testq $3, N + je .L999 + + testq $2, N + je .L80 + ALIGN_4 + +.L41: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + leaq (C, LDC, 1), CO2 # coffset2 = c + ldc + movq A, AO # aoffset = a + + movq K, %rax + salq $BASE_SHIFT + 1, %rax + leaq (B, %rax), BB + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L60 + ALIGN_4 + +.L51: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm1 + vmovddup -15 * SIZE(BO), %xmm5 + vmovddup -12 * SIZE(BO), %xmm3 + vxorps %xmm8, %xmm8,%xmm8 + vxorps %xmm9, %xmm9,%xmm9 + vxorps %xmm12, %xmm12,%xmm12 + vxorps %xmm13, %xmm13,%xmm13 + vmovups -16 * SIZE(AO), %xmm0 + vmovups -8 * SIZE(AO), %xmm4 + // prefetcht0 256(CO1) + // prefetcht0 320(CO1) + // prefetcht0 256(CO2) + // prefetcht0 320(CO2) + // prefetchnta 24 * SIZE(CO1) + // prefetchnta 32 * SIZE(CO1) + // prefetchw 3 * SIZE(CO1) + vmovups %xmm0, %xmm2 + // prefetchw 3 * SIZE(CO2) + // prefetchnta -16 * SIZE(BB) + // prefetch -16 * SIZE(BB) + subq $-8 * SIZE, BB + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L56 + ALIGN_4 + +.L52: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, 
%xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovups -10 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vmovups (AO, %rax, 4), %xmm0 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm5 + vmovups -6 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vmovups -4 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm5 + vmovups -2 * SIZE(AO, %rax, 4), %xmm2 + vfmaddpd %xmm8,%xmm3, %xmm4,%xmm8 + vfmaddpd %xmm12,%xmm2, %xmm3,%xmm12 + vfmaddpd %xmm9,%xmm5, %xmm4,%xmm9 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovups 8 * SIZE(AO, %rax, 4), %xmm4 + vmovddup -4 * SIZE(BO, %rax, 2), %xmm3 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $4 * SIZE, %rax + BRANCH + jl .L52 + ALIGN_4 + +.L56: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L59 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L57: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vmovups -14 * SIZE(AO, %rax, 4),%xmm2 + vfmaddpd %xmm12,%xmm2, %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm13,%xmm2, %xmm5,%xmm13 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm5 + vmovaps %xmm0, %xmm2 + + addq $SIZE, %rax + jl .L57 + ALIGN_4 + +.L59: +#ifndef TRMMKERNEL + vfmaddpd (CO1),%xmm7, %xmm8, %xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7, %xmm12, %xmm12 + 
vfmaddpd (CO2),%xmm7, %xmm9, %xmm9 + vfmaddpd 2 * SIZE(CO2),%xmm7, %xmm13, %xmm13 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + vmulpd %xmm7, %xmm12,%xmm12 + vmulpd %xmm7, %xmm13,%xmm13 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + + vmovups %xmm9, (CO2) + vmovups %xmm13, 2 * SIZE(CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + addq $4 * SIZE, CO2 # coffset += 4 + decq I # i -- + jg .L51 + ALIGN_4 + +.L60: + testq $2, M + je .L70 + ALIGN_4 + +.L61: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 2), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -12 * SIZE(AO), %xmm2 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -15 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L66 + ALIGN_4 + +.L62: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm0, %xmm1,%xmm10 + 
vmovddup -12 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm0, %xmm3,%xmm11 + vmovups -8 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -11 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm8,%xmm2, %xmm1,%xmm8 + vmovddup -10 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm2, %xmm3,%xmm9 + vmovups -10 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -9 * SIZE(BO, %rax, 2), %xmm3 + vfmaddpd %xmm10,%xmm2, %xmm1,%xmm10 + vmovddup -8 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm11,%xmm2, %xmm3,%xmm11 + vmovups -4 * SIZE(AO, %rax, 2), %xmm2 + vmovddup -7 * SIZE(BO, %rax, 2), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L62 + ALIGN_4 + +.L66: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L69 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L67: + vfmaddpd %xmm8,%xmm0, %xmm1,%xmm8 + vmovddup -14 * SIZE(BO, %rax, 2), %xmm1 + vfmaddpd %xmm9,%xmm0, %xmm3,%xmm9 + vmovups -14 * SIZE(AO, %rax, 2), %xmm0 + vmovddup -13 * SIZE(BO, %rax, 2), %xmm3 + + addq $SIZE, %rax + jl .L67 + ALIGN_4 + +.L69: + vaddpd %xmm10, %xmm8,%xmm8 + vaddpd %xmm11, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd (CO2),%xmm7, %xmm9,%xmm9 + +#else + + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm9,%xmm9 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm9, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 * SIZE, CO1 # coffset += 4 + addq $2 * SIZE, CO2 # coffset += 4 + ALIGN_4 + +.L70: + testq $1, M + je .L79 + ALIGN_4 + +.L71: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + 
movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 2), BO +#endif + + vmovddup -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup -15 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(AO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(AO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $2, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + NOBRANCH + je .L76 + ALIGN_4 + +.L72: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(AO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(BO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(AO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(BO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(AO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L72 + ALIGN_4 + +.L76: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L78 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO + negq %rax + ALIGN_4 + +.L77: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 2), %xmm0,%xmm8 + vmovddup -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L77 + ALIGN_4 + +.L78: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 + vmovhpd (CO2), %xmm0,%xmm0 +#endif + + vmulpd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddpd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + vmovhpd %xmm8, (CO2) + +#if (defined(TRMMKERNEL) && defined(LEFT) && 
defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 2), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $1, KK +#endif + ALIGN_4 + +.L79: +#if defined(TRMMKERNEL) && !defined(LEFT) + addq $2, KK +#endif + + movq BO, B + + leaq (C, LDC, 2), C + ALIGN_4 + +.L80: + testq $1, N + je .L999 + ALIGN_4 + +.L81: +#if defined(TRMMKERNEL) && defined(LEFT) + movq OFFSET, %rax + movq %rax, KK +#endif + + movq C, CO1 # coffset1 = c + movq A, AO # aoffset = a + + movq M, I + sarq $2, I # i = (m >> 2) + jle .L100 + ALIGN_4 + +.L91: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -8 * SIZE(AO), %xmm2 + vxorps %xmm8, %xmm8,%xmm8 + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -16 * SIZE(BO), %xmm1 + vxorps %xmm12, %xmm12,%xmm12 + vmovddup -14 * SIZE(BO), %xmm3 + vxorps %xmm13, %xmm13,%xmm13 + vmovddup -15 * SIZE(BO), %xmm5 + + // prefetchw 3 * SIZE(CO1) + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $4, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L96 + ALIGN_4 + +.L92: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovapd -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm1 + vfmaddpd %xmm9,%xmm5, %xmm0,%xmm9 + vfmaddpd %xmm13,-10 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd (AO, %rax, 4), %xmm0 + vmovddup -13 
* SIZE(BO, %rax, 1), %xmm5 + vfmaddpd %xmm8,%xmm3, %xmm2,%xmm8 + vfmaddpd %xmm12,-6 * SIZE(AO, %rax, 4), %xmm3,%xmm12 + vmovapd -4 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm3 + vfmaddpd %xmm9,%xmm5, %xmm2,%xmm9 + vfmaddpd %xmm13,-2 * SIZE(AO, %rax, 4), %xmm5,%xmm13 + vmovapd 8 * SIZE(AO, %rax, 4), %xmm2 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm5 + + addq $4 * SIZE, %rax + BRANCH + jl .L92 + ALIGN_4 + +.L96: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L99 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L97: + vfmaddpd %xmm8,%xmm1, %xmm0,%xmm8 + vfmaddpd %xmm12,-14 * SIZE(AO, %rax, 4), %xmm1,%xmm12 + vmovups -12 * SIZE(AO, %rax, 4), %xmm0 + vmovddup -15 * SIZE(BO, %rax, 1), %xmm1 + + addq $SIZE, %rax + jl .L97 + ALIGN_4 + +.L99: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm13, %xmm12,%xmm12 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 + vfmaddpd 2 * SIZE(CO1),%xmm7,%xmm12,%xmm12 + +#else + vmulpd %xmm7, %xmm8,%xmm8 + vmulpd %xmm7, %xmm12,%xmm12 + +#endif + + vmovups %xmm8, (CO1) + vmovups %xmm12, 2 * SIZE(CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 4), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $4, KK +#endif + + addq $4 * SIZE, CO1 # coffset += 4 + decq I # i -- + jg .L91 + ALIGN_4 + +.L100: + testq $2, M + je .L110 + ALIGN_4 + +.L101: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (B, %rax, 1), BO +#endif + + vmovddup -16 * SIZE(BO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + vmovddup 
-15 * SIZE(BO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + vmovddup -14 * SIZE(BO), %xmm2 + vxorps %xmm10, %xmm10,%xmm10 + vmovddup -13 * SIZE(BO), %xmm3 + vxorps %xmm11, %xmm11,%xmm11 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $2, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L106 + ALIGN_4 + +.L102: + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + vmovddup -12 * SIZE(BO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(AO, %rax, 2), %xmm1,%xmm9 + vmovddup -11 * SIZE(BO, %rax, 1), %xmm1 + + vfmaddpd %xmm10,-12 * SIZE(AO, %rax, 2), %xmm2,%xmm10 + vmovddup -10 * SIZE(BO, %rax, 1), %xmm2 + + vfmaddpd %xmm11,-10 * SIZE(AO, %rax, 2), %xmm3,%xmm11 + vmovddup -9 * SIZE(BO, %rax, 1), %xmm3 + + addq $4 * SIZE, %rax + BRANCH + jl .L102 + ALIGN_4 + +.L106: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L109 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L107: + vmovddup -16 * SIZE(BO, %rax, 1), %xmm0 + vfmaddpd %xmm8,-16 * SIZE(AO, %rax, 2), %xmm0,%xmm8 + + addq $SIZE, %rax + jl .L107 + ALIGN_4 + +.L109: + vaddpd %xmm9, %xmm8,%xmm8 + vaddpd %xmm11, %xmm10,%xmm10 + vaddpd %xmm10, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + + vfmaddpd (CO1),%xmm7, %xmm8,%xmm8 +#else + vmulpd %xmm7, %xmm8,%xmm8 + +#endif + + vmovups %xmm8, (CO1) + +#if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq K, %rax + subq KKK, %rax + leaq (,%rax, SIZE), %rax + leaq (AO, %rax, 2), AO + leaq (BO, %rax, 1), BO +#endif + +#if defined(TRMMKERNEL) && defined(LEFT) + addq $2, KK +#endif + + addq $2 
* SIZE, CO1 # coffset += 4 + + ALIGN_4 + +.L110: + testq $1, M + je .L999 + ALIGN_4 + +.L111: +#if !defined(TRMMKERNEL) || \ + (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || \ + (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) + movq B, BO +#else + movq KK, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (B, %rax, 1), BO +#endif + + vmovups -16 * SIZE(AO), %xmm0 + vxorps %xmm8, %xmm8,%xmm8 + movups -14 * SIZE(AO), %xmm1 + vxorps %xmm9, %xmm9,%xmm9 + +#ifndef TRMMKERNEL + movq K, %rax +#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) + movq K, %rax + subq KK, %rax + movq %rax, KKK +#else + movq KK, %rax +#ifdef LEFT + addq $1, %rax +#else + addq $1, %rax +#endif + movq %rax, KKK +#endif + + andq $-4, %rax + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + NOBRANCH + je .L116 + ALIGN_4 + +.L112: + vfmaddpd %xmm8,-16 * SIZE(BO, %rax, 1), %xmm0,%xmm8 + vmovups -12 * SIZE(AO, %rax, 1), %xmm0 + + vfmaddpd %xmm9,-14 * SIZE(BO, %rax, 1), %xmm1,%xmm9 + vmovups -10 * SIZE(AO, %rax, 1), %xmm1 + + addq $4 * SIZE, %rax + BRANCH + jl .L112 + ALIGN_4 + +.L116: + vmovddup ALPHA, %xmm7 + +#ifndef TRMMKERNEL + movq K, %rax +#else + movq KKK, %rax +#endif + andq $3, %rax # if (k & 1) + je .L118 + + leaq (, %rax, SIZE), %rax + leaq (AO, %rax, 1), AO + leaq (BO, %rax, 1), BO + negq %rax + ALIGN_4 + +.L117: + vmulsd -16 * SIZE(BO, %rax, 1), %xmm0,%xmm0 + vaddsd %xmm0, %xmm8,%xmm8 + vmovsd -15 * SIZE(AO, %rax, 1), %xmm0 + + addq $SIZE, %rax + jl .L117 + ALIGN_4 + +.L118: + vaddpd %xmm9, %xmm8,%xmm8 + vhaddpd %xmm8, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vmovsd (CO1), %xmm0 +#endif + + vmulsd %xmm7, %xmm8,%xmm8 + +#ifndef TRMMKERNEL + vaddsd %xmm0, %xmm8,%xmm8 +#endif + + vmovsd %xmm8, (CO1) + ALIGN_4 + +.L999: + movq (%rsp), %rbx + movq 8(%rsp), %rbp + movq 16(%rsp), %r12 + movq 24(%rsp), %r13 + movq 32(%rsp), %r14 + movq 40(%rsp), %r15 + +#ifdef WINDOWS_ABI + movq 48(%rsp), 
%rdi + movq 56(%rsp), %rsi + movups 64(%rsp), %xmm6 + movups 80(%rsp), %xmm7 + movups 96(%rsp), %xmm8 + movups 112(%rsp), %xmm9 + movups 128(%rsp), %xmm10 + movups 144(%rsp), %xmm11 + movups 160(%rsp), %xmm12 + movups 176(%rsp), %xmm13 + movups 192(%rsp), %xmm14 + movups 208(%rsp), %xmm15 +#endif + + addq $STACKSIZE, %rsp + ret + + EPILOGUE From bb10cb8442f84e76d1140f58b37fa1edc3393972 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Dec 2012 11:51:39 +0800 Subject: [PATCH 05/30] Refs #165. fall back of DTB_DEFAULT_ENTRIES for some virtual machines. --- cpuid_x86.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index 6e4eae20d..2ffc5f1d5 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1494,6 +1494,9 @@ void get_cpuconfig(void){ printf("#define DTB_SIZE %d\n", info.size * 1024); printf("#define DTB_ASSOCIATIVE %d\n", info.associative); printf("#define DTB_DEFAULT_ENTRIES %d\n", info.linesize); + } else { + //fall back for some virtual machines. + printf("#define DTB_DEFAULT_ENTRIES 32\n"); } features = get_cputype(GET_FEATURE); From bdf8d9411e2e3698c7462a46e813c148c0e8aa98 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Mon, 10 Dec 2012 15:49:01 +0800 Subject: [PATCH 06/30] Refs #163. Obtain the build configure on runtime. openblas_get_config function returns the configure string. So far, it supports USE64BITINT, NO_CBLAS, NO_LAPACK, NO_LAPACKE, DYNAMIC_ARCH, NO_AFFINITY. 
Example: #include <stdio.h> extern char * openblas_get_config(); void main() { printf("%s\n",openblas_get_config()); return; } --- cblas.h | 3 ++ driver/others/Makefile | 5 ++- driver/others/openblas_get_config.c | 59 +++++++++++++++++++++++++++++ exports/gensymbol | 1 + 4 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 driver/others/openblas_get_config.c diff --git a/cblas.h b/cblas.h index ee8bf08b2..e9664fe79 100644 --- a/cblas.h +++ b/cblas.h @@ -13,6 +13,9 @@ extern "C" { void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); +/*Get the build configure on runtime.*/ +char* openblas_get_config(void); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/driver/others/Makefile b/driver/others/Makefile index a1c7a504e..c449ec6c6 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,7 +1,7 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) openblas_get_config.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) @@ -103,6 +103,9 @@ blas_server.$(SUFFIX) : $(BLAS_SERVER) ../../common.h ../../common_thread.h ../. 
openblas_set_num_threads.$(SUFFIX) : openblas_set_num_threads.c $(CC) $(CFLAGS) -c $< -o $(@F) +openblas_get_config.$(SUFFIX) : openblas_get_config.c + $(CC) $(CFLAGS) -c $< -o $(@F) + blasL1thread.$(SUFFIX) : blas_l1_thread.c ../../common.h ../../common_thread.h $(CC) $(CFLAGS) -c $< -o $(@F) diff --git a/driver/others/openblas_get_config.c b/driver/others/openblas_get_config.c new file mode 100644 index 000000000..581ab1a43 --- /dev/null +++ b/driver/others/openblas_get_config.c @@ -0,0 +1,59 @@ +/***************************************************************************** +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the ISCAS nor the names of its contributors may + be used to endorse or promote products derived from this software + without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include "common.h" + +static char* openblas_config_str="" +#ifdef USE64BITINT + "USE64BITINT " +#endif +#ifdef NO_CBLAS + "NO_CBLAS " +#endif +#ifdef NO_LAPACK + "NO_LAPACK " +#endif +#ifdef NO_LAPACKE + "NO_LAPACKE " +#endif +#ifdef DYNAMIC_ARCH + "DYNAMIC_ARCH " +#endif +#ifdef NO_AFFINITY + "NO_AFFINITY " +#endif + ; + +char* CNAME() { + return openblas_config_str; +} + diff --git a/exports/gensymbol b/exports/gensymbol index c492eefb5..04cbd7d84 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -74,6 +74,7 @@ @misc_no_underscore_objs = ( openblas_set_num_threads, goto_set_num_threads, + openblas_get_config, ); @misc_underscore_objs = ( From 13f8fc0b1a4f843f0b77913caeb791cb15a3ae3c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 11 Dec 2012 10:55:10 +0100 Subject: [PATCH 07/30] Write FMA4 flag to the configure file. 
--- cpuid_x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cpuid_x86.c b/cpuid_x86.c index afc3b17b7..385114619 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1525,6 +1525,7 @@ void get_cpuconfig(void){ if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); + if (features & HAVE_FMA4 ) printf("#define HAVE_FMA4\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); if (features & HAVE_HIT) printf("#define HAVE_HIT 1\n"); if (features & HAVE_MISALIGNSSE) printf("#define HAVE_MISALIGNSSE\n"); @@ -1591,5 +1592,6 @@ void get_sse(void){ if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); + if (features & HAVE_FMA4 ) printf("HAVE_FMA4=1\n"); } From a4ee6f3915758e9272ec9f33206e86d7059bcd1e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 18 Dec 2012 08:57:46 +0800 Subject: [PATCH 08/30] Fixed #172. Support Intel Xeon E7540. 
--- cpuid_x86.c | 4 ++++ driver/others/dynamic.c | 3 ++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 2ffc5f1d5..b3352244b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1030,6 +1030,8 @@ int get_cpuname(void){ return CPUTYPE_SANDYBRIDGE; else return CPUTYPE_NEHALEM; + case 14: + // Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1398,6 +1400,8 @@ int get_coretype(void){ return CORE_SANDYBRIDGE; else return CORE_NEHALEM; //OS doesn't support AVX + case 14: + //Xeon E7540 case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 23de095ca..28fdd30d8 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -163,7 +163,8 @@ static gotoblas_t *get_coretype(void){ //Intel Xeon Processor 5600 (Westmere-EP) //Xeon Processor E7 (Westmere-EX) - if (model == 12 || model == 15) return &gotoblas_NEHALEM; + //Xeon E7540 + if (model == 12 || model == 14 || model == 15) return &gotoblas_NEHALEM; //Intel Core i5-2000 /i7-2000 (Sandy Bridge) //Intel Core i7-3000 / Xeon E5 From fd3046b32a1f7049fcb2bfb255d72e4204e5522e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 23 Dec 2012 21:47:22 +0800 Subject: [PATCH 09/30] Refs #173. Fixed overflow internal buffer bug of gemv_t on x86. 
--- kernel/x86/gemv_t_sse.S | 67 +++++++++++++++++++++++++++++++------ kernel/x86/gemv_t_sse2.S | 71 +++++++++++++++++++++++++++++++++------- kernel/x86_64/sgemv_t.S | 61 +++++++++++++++++++++++++--------- 3 files changed, 163 insertions(+), 36 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 5bacb7da8..c72febe3d 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -89,17 +89,23 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define NN 4+STACKSIZE(%esp) +#define AA 8+STACKSIZE(%esp) +#define LDAX 12+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -114,6 +120,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -122,6 +129,37 @@ PROFCODE movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $23,J # J=2^22 + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA + movl STACK_X, X movl 
STACK_INCX, INCX movl STACK_INCY, INCY @@ -628,10 +666,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index c7e685dd8..d46d7e43e 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -76,18 +76,24 @@ #endif #define STACKSIZE 16 +#define ARGS 16 + +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+STACKSIZE(%esp) +#define AA 4+STACKSIZE(%esp) +#define LDAX 8+STACKSIZE(%esp) +#define NN 12+STACKSIZE(%esp) -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) - #define I %eax #define J %ebx @@ -101,6 +107,8 @@ PROLOGUE + subl $ARGS,%esp + pushl %ebp pushl %edi pushl %esi @@ -108,7 +116,38 @@ PROFCODE + movl STACK_LDA, LDA + movl LDA,LDAX # backup LDA + movl N,J + movl J,NN # backup N + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # mov M to MMM +.L0t: + xorl J,J + addl $1,J + sall $22,J # J=2^22 + subl J,MMM # MMM=MMM-J + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A # mov AA to A + + movl NN,%eax + movl %eax,N # reset N + + + movl LDAX, LDA # reset LDA movl STACK_X, X movl STACK_INCX, INCX movl 
STACK_INCY, INCY @@ -117,6 +156,7 @@ leal (,INCY, SIZE), INCY leal (,LDA, SIZE), LDA + subl $-16 * SIZE, A cmpl $0, N @@ -560,10 +600,19 @@ ALIGN_4 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 052ff1a79..06970a055 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -47,7 +47,7 @@ #ifndef WINDOWS_ABI -#define STACKSIZE 64 +#define STACKSIZE 128 #define OLD_M %rdi #define OLD_N %rsi @@ -57,6 +57,10 @@ #define STACK_Y 16 + STACKSIZE(%rsp) #define STACK_INCY 24 + STACKSIZE(%rsp) #define STACK_BUFFER 32 + STACKSIZE(%rsp) +#define MMM 56(%rsp) +#define NN 64(%rsp) +#define AA 72(%rsp) +#define LDAX 80(%rsp) #else @@ -71,6 +75,10 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) +#defien MMM 216(%rsp) +#defien NN 224(%rsp) +#define AA 232(%rsp) +#define LDAX 240(%rsp) #endif @@ -127,29 +135,46 @@ movups %xmm14, 192(%rsp) movups %xmm15, 208(%rsp) - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX movq OLD_X, X #else - movq OLD_M, M - movq OLD_N, N - movq OLD_A, A - movq OLD_LDA, LDA + movq OLD_M, MMM + movq OLD_N, NN + movq OLD_A, AA + movq OLD_LDA, LDAX #endif - - movq STACK_INCX, INCX - movq STACK_Y, Y - movq STACK_INCY, INCY - movq STACK_BUFFER, BUFFER - #ifndef WINDOWS_ABI pshufd $0, %xmm0, ALPHA #else pshufd $0, %xmm3, ALPHA #endif + +.L0t: + xorq M,M + addq $1,M + salq $22,M + subq M,MMM + jge .L00t + ALIGN_4 + + movq MMM,%rax + addq M,%rax + jle .L999x + movq %rax,M + +.L00t: + movq LDAX,LDA + movq NN,N + movq AA,A + movq STACK_INCX, INCX + movq STACK_Y, Y + movq STACK_INCY, INCY + movq STACK_BUFFER, BUFFER + leaq (,INCX, SIZE), INCX leaq (,INCY, SIZE), INCY leaq (,LDA, SIZE), LDA @@ -6341,6 +6366,12 @@ 
ALIGN_4 .L999: + leaq (,M,SIZE),%rax + addq %rax,AA + jmp .L0t + ALIGN_4 + +.L999x: movq 0(%rsp), %rbx movq 8(%rsp), %rbp movq 16(%rsp), %r12 From 91ed4e4450ceabd71493e0bf80e7455df414bebf Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 23 Dec 2012 23:14:17 +0800 Subject: [PATCH 10/30] Refs #171. Prevent loading the dirty number from the buffer in sgemv_t x86 kernel. --- kernel/x86/gemv_t_sse.S | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index c72febe3d..42ed19998 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -236,6 +236,20 @@ jg .L06 ALIGN_4 +//Padding zero to prevent loading the dirty number from buffer. + movl M, I + movl $8, J + andl $7, I + xorps %xmm0, %xmm0 + subl I, J + ALIGN_2 +.L07: + movss %xmm0, 0 * SIZE(Y1) + addl $SIZE, Y1 + decl J + jg .L07 + ALIGN_4 + .L10: movl Y, Y1 From 0d1518add98bc3c0e83887be74cda3b23c8937ee Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 25 Dec 2012 09:10:17 +0800 Subject: [PATCH 11/30] Refs #173. 
Fixed overflow internal buffer bug of sgemv_t on x86 --- kernel/x86/gemv_t_sse.S | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index 42ed19998..fa6cfc50b 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -89,7 +89,7 @@ #endif #define STACKSIZE 16 -#define ARGS 16 +#define ARGS 20 #define M 4 + STACKSIZE+ARGS(%esp) #define N 8 + STACKSIZE+ARGS(%esp) @@ -106,6 +106,7 @@ #define NN 4+STACKSIZE(%esp) #define AA 8+STACKSIZE(%esp) #define LDAX 12+STACKSIZE(%esp) +#define XX 16+STACKSIZE(%esp) #define I %eax #define J %ebx @@ -130,6 +131,8 @@ movl STACK_LDA, LDA movl LDA,LDAX # backup LDA + movl STACK_X, X + movl X,XX movl N,J movl J,NN # backup N movl A,J @@ -139,7 +142,7 @@ .L0t: xorl J,J addl $1,J - sall $23,J # J=2^22 + sall $21,J # J=2^22 subl J,MMM # MMM=MMM-J movl J,M jge .L00t @@ -159,8 +162,8 @@ movl LDAX, LDA # reset LDA + movl XX,X - movl STACK_X, X movl STACK_INCX, INCX movl STACK_INCY, INCY @@ -683,6 +686,9 @@ movl M,J leal (,J,SIZE),%eax addl %eax,AA + movl XX,J + addl %eax,J + movl J,XX jmp .L0t ALIGN_4 From 69200884e13e98b79487cfd1c78faf054278ec2f Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 25 Dec 2012 09:27:49 +0800 Subject: [PATCH 12/30] Refs #173. 
Fixed overflow internal buffer bug of gemv_n on x86 --- kernel/x86/gemv_n_sse.S | 65 ++++++++++++++++++++++++++++++++------- kernel/x86/gemv_n_sse2.S | 66 ++++++++++++++++++++++++++++++++++------ 2 files changed, 110 insertions(+), 21 deletions(-) diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index 0891657fa..3ff9203c8 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -89,17 +89,22 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 20 + STACKSIZE(%esp) -#define STACK_LDA 24 + STACKSIZE(%esp) -#define STACK_X 28 + STACKSIZE(%esp) -#define STACK_INCX 32 + STACKSIZE(%esp) -#define Y 36 + STACKSIZE(%esp) -#define STACK_INCY 40 + STACKSIZE(%esp) -#define BUFFER 44 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 20 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 24 + STACKSIZE+ARGS(%esp) +#define STACK_X 28 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 32 + STACKSIZE+ARGS(%esp) +#define Y 36 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 40 + STACKSIZE+ARGS(%esp) +#define BUFFER 44 + STACKSIZE+ARGS(%esp) +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) +#define LDAX 12+ARGS(%esp) #define I %eax #define J %ebx @@ -114,6 +119,7 @@ PROLOGUE + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -121,7 +127,34 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $21,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y movl STACK_LDA, LDA + movl STACK_X, X movl STACK_INCX, INCX @@ -651,12 +684,22 @@ addss 0 * SIZE(X), %xmm0 movss %xmm0, (Y1) ALIGN_3 - .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J 
+ addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 5f5fa5a51..980797d91 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -76,17 +76,22 @@ #endif #define STACKSIZE 16 +#define ARGS 16 -#define M 4 + STACKSIZE(%esp) -#define N 8 + STACKSIZE(%esp) -#define ALPHA 16 + STACKSIZE(%esp) -#define A 24 + STACKSIZE(%esp) -#define STACK_LDA 28 + STACKSIZE(%esp) -#define STACK_X 32 + STACKSIZE(%esp) -#define STACK_INCX 36 + STACKSIZE(%esp) -#define Y 40 + STACKSIZE(%esp) -#define STACK_INCY 44 + STACKSIZE(%esp) -#define BUFFER 48 + STACKSIZE(%esp) +#define M 4 + STACKSIZE+ARGS(%esp) +#define N 8 + STACKSIZE+ARGS(%esp) +#define ALPHA 16 + STACKSIZE+ARGS(%esp) +#define A 24 + STACKSIZE+ARGS(%esp) +#define STACK_LDA 28 + STACKSIZE+ARGS(%esp) +#define STACK_X 32 + STACKSIZE+ARGS(%esp) +#define STACK_INCX 36 + STACKSIZE+ARGS(%esp) +#define Y 40 + STACKSIZE+ARGS(%esp) +#define STACK_INCY 44 + STACKSIZE+ARGS(%esp) +#define BUFFER 48 + STACKSIZE+ARGS(%esp) + +#define MMM 0+ARGS(%esp) +#define YY 4+ARGS(%esp) +#define AA 8+ARGS(%esp) #define I %eax #define J %ebx @@ -101,6 +106,8 @@ PROLOGUE + + subl $ARGS,%esp pushl %ebp pushl %edi pushl %esi @@ -108,6 +115,33 @@ PROFCODE + movl Y,J + movl J,YY # backup Y + movl A,J + movl J,AA # backup A + movl M,J + movl J,MMM # backup MM +.L0t: + xorl J,J + addl $1,J + sall $20,J + subl J,MMM + movl J,M + jge .L00t + ALIGN_4 + + movl MMM,%eax + addl J,%eax + jle .L999x + movl %eax,M + +.L00t: + movl AA,%eax + movl %eax,A + + movl YY,J + movl J,Y + movl STACK_LDA, LDA movl STACK_X, X movl STACK_INCX, INCX @@ -677,10 +711,22 @@ ALIGN_3 .L999: + movl M,J + leal (,J,SIZE),%eax + addl %eax,AA + movl YY,J + addl %eax,J + movl J,YY + jmp .L0t + ALIGN_4 + +.L999x: + popl %ebx popl %esi popl %edi popl %ebp + addl $ARGS,%esp ret EPILOGUE From 8b122ff9dc8a7d3e695283f0d5c6b4d576e9356e Mon 
Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 3 Jan 2013 01:47:31 +0800 Subject: [PATCH 13/30] Refs #176. Fixed make.inc overriding RANLIB bug when cross-compiling LAPACK. --- make.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/make.inc b/make.inc index 30004233f..01b9bde92 100644 --- a/make.inc +++ b/make.inc @@ -4,7 +4,7 @@ DRVOPTS = $(OPTS) LOADER = $(FORTRAN) TIMER = NONE ARCHFLAGS= -ru -RANLIB = ranlib +#RANLIB = ranlib BLASLIB = TMGLIB = tmglib.a EIGSRCLIB = eigsrc.a From 08bf6674d543db41c13053d1388602cb4d070373 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 5 Jan 2013 11:36:39 +0800 Subject: [PATCH 14/30] Refs #177. Fixed sgemv_t compiling bug on Win64. --- kernel/x86_64/sgemv_t.S | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index 06970a055..f516f08af 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,4 +1,5 @@ -/*********************************************************************/ + ;; 2c +1 /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ @@ -75,8 +76,8 @@ #define STACK_Y 72 + STACKSIZE(%rsp) #define STACK_INCY 80 + STACKSIZE(%rsp) #define STACK_BUFFER 88 + STACKSIZE(%rsp) -#defien MMM 216(%rsp) -#defien NN 224(%rsp) +#define MMM 216(%rsp) +#define NN 224(%rsp) #define AA 232(%rsp) #define LDAX 240(%rsp) @@ -137,8 +138,10 @@ movq OLD_M, MMM movq OLD_N, NN - movq OLD_A, AA - movq OLD_LDA, LDAX + movq OLD_A, X + movq X, AA + movq OLD_LDA, X + movq X, LDAX movq OLD_X, X #else movq OLD_M, MMM From 99d1978df7d9968db1e1f7ed147f67a9ec799d95 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 12 Jan 2013 12:31:14 +0800 Subject: [PATCH 15/30] Fixed #180. 
the typos in kernel/x86_64/sgemv_t.S --- kernel/x86_64/sgemv_t.S | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/x86_64/sgemv_t.S b/kernel/x86_64/sgemv_t.S index f516f08af..854e0f295 100644 --- a/kernel/x86_64/sgemv_t.S +++ b/kernel/x86_64/sgemv_t.S @@ -1,5 +1,3 @@ - ;; 2c -1 /* Copyright 2009, 2010 The University of Texas at Austin. */ /* All rights reserved. */ /* */ From 200e4acf152f11444cf32f8a2a93fde0bc700e9d Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 25 Jun 2012 13:51:46 +0200 Subject: [PATCH 16/30] cblas: typedef enums for improved compatibility with Intel MKL. Netlib style: enum CBLAS_XYZ {X=1, Y=2, Z=3}; Intel MKL style: typedef enum {X=1, Y=2, Z=3} CBLAS_XYZ; With this hybrid style, code written in the latter form won't need any modifications to be built with OpenBLAS. This change should not affect existing code, although a warning may be emitted for C code which does the following (does not occur with C++): typedef enum CBLAS_XYZ CBLAS_XYZ; warning: redefinition of typedef 'CBLAS_XYZ' [-pedantic] --- cblas.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/cblas.h b/cblas.h index e9664fe79..5d50238e1 100644 --- a/cblas.h +++ b/cblas.h @@ -18,11 +18,11 @@ char* openblas_get_config(void); #define CBLAS_INDEX size_t -enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; -enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114}; -enum CBLAS_UPLO {CblasUpper=121, CblasLower=122}; -enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132}; -enum CBLAS_SIDE {CblasLeft=141, CblasRight=142}; +typedef enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102} CBLAS_ORDER; +typedef enum CBLAS_TRANSPOSE {CblasNoTrans=111, CblasTrans=112, CblasConjTrans=113, CblasConjNoTrans=114} CBLAS_TRANSPOSE; +typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; +typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; +typedef enum CBLAS_SIDE {CblasLeft=141, 
CblasRight=142} CBLAS_SIDE; float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); From 0b08f7479e26ce0ef8e076185bb89f16479335e9 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 20 Jan 2013 21:22:12 +0800 Subject: [PATCH 17/30] Refs #154. Fixed gemv_t bug about overflow 16MB buffer on x86. --- kernel/x86/gemv_t_sse.S | 4 +++- kernel/x86/gemv_t_sse2.S | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index fa6cfc50b..326584bbc 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -142,7 +142,9 @@ .L0t: xorl J,J addl $1,J - sall $21,J # J=2^22 + sall $22,J # J=2^24*sizeof(float)=buffer size(16MB) + subl $8, J # Don't use last 8 float in the buffer. + # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index d46d7e43e..60d6ef270 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -128,7 +128,9 @@ .L0t: xorl J,J addl $1,J - sall $22,J # J=2^22 + sall $21,J # J=2^21*sizeof(double)=buffer size(16MB) + subl $4, J # Don't use last 4 double in the buffer. + # Now, split M by block J subl J,MMM # MMM=MMM-J movl J,M jge .L00t From 4db6660de4756b25d7b71c00d7893f2b15587f1c Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 Jan 2013 21:53:52 +0100 Subject: [PATCH 18/30] Refs #185. Add missing 'const' to declarations in <cblas.h>. Thanks to Dan Povey! 
The 'const' modifications were done automatically using this scripts: https://kaldi.svn.sourceforge.net/svnroot/kaldi/sandbox/dan/tools/for_openblas --- Makefile | 2 +- Makefile.getarch => Makefile.prebuild | 7 +- Makefile.system | 2 +- cblas.h | 429 +++++++++++++------------- common.h | 3 +- 5 files changed, 224 insertions(+), 219 deletions(-) rename Makefile.getarch => Makefile.prebuild (81%) diff --git a/Makefile b/Makefile index 39e3bbd65..a84b54d4b 100644 --- a/Makefile +++ b/Makefile @@ -314,7 +314,7 @@ clean :: #endif @$(MAKE) -C reference clean @rm -f *.$(LIBSUFFIX) *.so *~ *.exe getarch getarch_2nd *.dll *.lib *.$(SUFFIX) *.dwf $(LIBPREFIX).$(LIBSUFFIX) $(LIBPREFIX)_p.$(LIBSUFFIX) $(LIBPREFIX).so.$(MAJOR_VERSION) *.lnk myconfig.h - @rm -f Makefile.conf config.h Makefile_kernel.conf config_kernel.h st* *.dylib + @rm -f Makefile.conf config.h cblas_noconst.h Makefile_kernel.conf config_kernel.h st* *.dylib @if test -d $(NETLIB_LAPACK_DIR); then \ echo deleting $(NETLIB_LAPACK_DIR); \ rm -rf $(NETLIB_LAPACK_DIR) ;\ diff --git a/Makefile.getarch b/Makefile.prebuild similarity index 81% rename from Makefile.getarch rename to Makefile.prebuild index dadfb5b1b..f4b0bb5af 100644 --- a/Makefile.getarch +++ b/Makefile.prebuild @@ -1,3 +1,5 @@ +# This is triggered by Makefile.system and runs before any of the code is built. + export BINARY export USE_OPENMP @@ -15,7 +17,7 @@ ifdef CPUIDEMU EXFLAGS = -DCPUIDEMU -DVENDOR=99 endif -all: getarch_2nd +all: getarch_2nd cblas_noconst.h ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) @@ -36,4 +38,7 @@ else $(HOSTCC) -I. 
$(CFLAGS) -DBUILD_KERNEL -o $(@F) getarch_2nd.c endif +cblas_noconst.h : cblas.h + sed -e "s/\bconst\b\s*//g" cblas.h > cblas_noconst.h + dummy: diff --git a/Makefile.system b/Makefile.system index 27f30fa61..239047f36 100644 --- a/Makefile.system +++ b/Makefile.system @@ -70,7 +70,7 @@ ifndef GOTOBLAS_MAKEFILE export GOTOBLAS_MAKEFILE = 1 # Generating Makefile.conf and config.h -DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.getarch CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) +DUMMY := $(shell $(MAKE) -C $(TOPDIR) -f Makefile.prebuild CC="$(CC)" FC="$(FC)" HOSTCC="$(HOSTCC)" CFLAGS="$(GETARCH_FLAGS)" BINARY=$(BINARY) USE_OPENMP=$(USE_OPENMP) TARGET_CORE=$(TARGET_CORE) all) ifndef TARGET_CORE include $(TOPDIR)/Makefile.conf diff --git a/cblas.h b/cblas.h index 5d50238e1..501e7d0d1 100644 --- a/cblas.h +++ b/cblas.h @@ -24,271 +24,270 @@ typedef enum CBLAS_UPLO {CblasUpper=121, CblasLower=122} CBLAS_UPLO; typedef enum CBLAS_DIAG {CblasNonUnit=131, CblasUnit=132} CBLAS_DIAG; typedef enum CBLAS_SIDE {CblasLeft=141, CblasRight=142} CBLAS_SIDE; -float cblas_sdsdot(blasint n, float, float *x, blasint incx, float *y, blasint incy); -double cblas_dsdot (blasint n, float *x, blasint incx, float *y, blasint incy); -float cblas_sdot(blasint n, float *x, blasint incx, float *y, blasint incy); -double cblas_ddot(blasint n, double *x, blasint incx, double *y, blasint incy); +float cblas_sdsdot(const blasint n, const float alpha, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_dsdot (const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +float cblas_sdot(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +double cblas_ddot(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -openblas_complex_float cblas_cdotu(blasint n, float *x, 
blasint incx, float *y, blasint incy); -openblas_complex_float cblas_cdotc(blasint n, float *x, blasint incx, float *y, blasint incy); -openblas_complex_double cblas_zdotu(blasint n, double *x, blasint incx, double *y, blasint incy); -openblas_complex_double cblas_zdotc(blasint n, double *x, blasint incx, double *y, blasint incy); +openblas_complex_float cblas_cdotu(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_float cblas_cdotc(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy); +openblas_complex_double cblas_zdotu(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); +openblas_complex_double cblas_zdotc(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy); -void cblas_cdotu_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_cdotc_sub(blasint n, float *x, blasint incx, float *y, blasint incy, openblas_complex_float *ret); -void cblas_zdotu_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); -void cblas_zdotc_sub(blasint n, double *x, blasint incx, double *y, blasint incy, openblas_complex_double *ret); +void cblas_cdotu_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_cdotc_sub(const blasint n, const float *x, const blasint incx, const float *y, const blasint incy, openblas_complex_float *ret); +void cblas_zdotu_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); +void cblas_zdotc_sub(const blasint n, const double *x, const blasint incx, const double *y, const blasint incy, openblas_complex_double *ret); -float cblas_sasum (blasint n, float *x, blasint incx); -double cblas_dasum (blasint n, double *x, blasint incx); -float 
cblas_scasum(blasint n, float *x, blasint incx); -double cblas_dzasum(blasint n, double *x, blasint incx); +float cblas_sasum (const blasint n, const float *x, const blasint incx); +double cblas_dasum (const blasint n, const double *x, const blasint incx); +float cblas_scasum(const blasint n, const float *x, const blasint incx); +double cblas_dzasum(const blasint n, const double *x, const blasint incx); -float cblas_snrm2 (blasint N, float *X, blasint incX); -double cblas_dnrm2 (blasint N, double *X, blasint incX); -float cblas_scnrm2(blasint N, float *X, blasint incX); -double cblas_dznrm2(blasint N, double *X, blasint incX); +float cblas_snrm2 (const blasint N, const float *X, const blasint incX); +double cblas_dnrm2 (const blasint N, const double *X, const blasint incX); +float cblas_scnrm2(const blasint N, const float *X, const blasint incX); +double cblas_dznrm2(const blasint N, const double *X, const blasint incX); -CBLAS_INDEX cblas_isamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_idamax(blasint n, double *x, blasint incx); -CBLAS_INDEX cblas_icamax(blasint n, float *x, blasint incx); -CBLAS_INDEX cblas_izamax(blasint n, double *x, blasint incx); +CBLAS_INDEX cblas_isamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_idamax(const blasint n, const double *x, const blasint incx); +CBLAS_INDEX cblas_icamax(const blasint n, const float *x, const blasint incx); +CBLAS_INDEX cblas_izamax(const blasint n, const double *x, const blasint incx); -void cblas_saxpy(blasint n, float, float *x, blasint incx, float *y, blasint incy); -void cblas_daxpy(blasint n, double, double *x, blasint incx, double *y, blasint incy); -void cblas_caxpy(blasint n, float *, float *x, blasint incx, float *y, blasint incy); -void cblas_zaxpy(blasint n, double *, double *x, blasint incx, double *y, blasint incy); +void cblas_saxpy(const blasint n, const float alpha, const float *x, const blasint incx, float *y, const blasint incy); +void 
cblas_daxpy(const blasint n, const double alpha, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_caxpy(const blasint n, const float *alpha, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zaxpy(const blasint n, const double *alpha, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_scopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dcopy(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_ccopy(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zcopy(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_scopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_dcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); +void cblas_ccopy(const blasint n, const float *x, const blasint incx, float *y, const blasint incy); +void cblas_zcopy(const blasint n, const double *x, const blasint incx, double *y, const blasint incy); -void cblas_sswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_dswap(blasint n, double *x, blasint incx, double *y, blasint incy); -void cblas_cswap(blasint n, float *x, blasint incx, float *y, blasint incy); -void cblas_zswap(blasint n, double *x, blasint incx, double *y, blasint incy); +void cblas_sswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_dswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); +void cblas_cswap(const blasint n, float *x, const blasint incx, float *y, const blasint incy); +void cblas_zswap(const blasint n, double *x, const blasint incx, double *y, const blasint incy); -void cblas_srot(blasint N, float *X, blasint incX, float *Y, blasint incY, float c, float s); -void cblas_drot(blasint N, double *X, blasint incX, double *Y, blasint incY, double c, double s); +void 
cblas_srot(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float c, const float s); +void cblas_drot(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double c, const double s); void cblas_srotg(float *a, float *b, float *c, float *s); void cblas_drotg(double *a, double *b, double *c, double *s); -void cblas_srotm(blasint N, float *X, blasint incX, float *Y, blasint incY, float *P); -void cblas_drotm(blasint N, double *X, blasint incX, double *Y, blasint incY, double *P); +void cblas_srotm(const blasint N, float *X, const blasint incX, float *Y, const blasint incY, const float *P); +void cblas_drotm(const blasint N, double *X, const blasint incX, double *Y, const blasint incY, const double *P); -void cblas_srotmg(float *d1, float *d2, float *b1, float b2, float *P); -void cblas_drotmg(double *d1, double *d2, double *b1, double b2, double *P); +void cblas_srotmg(float *d1, float *d2, float *b1, const float b2, float *P); +void cblas_drotmg(double *d1, double *d2, double *b1, const double b2, double *P); -void cblas_sscal(blasint N, float alpha, float *X, blasint incX); -void cblas_dscal(blasint N, double alpha, double *X, blasint incX); -void cblas_cscal(blasint N, float *alpha, float *X, blasint incX); -void cblas_zscal(blasint N, double *alpha, double *X, blasint incX); -void cblas_csscal(blasint N, float alpha, float *X, blasint incX); -void cblas_zdscal(blasint N, double alpha, double *X, blasint incX); +void cblas_sscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_dscal(const blasint N, const double alpha, double *X, const blasint incX); +void cblas_cscal(const blasint N, const float *alpha, float *X, const blasint incX); +void cblas_zscal(const blasint N, const double *alpha, double *X, const blasint incX); +void cblas_csscal(const blasint N, const float alpha, float *X, const blasint incX); +void cblas_zdscal(const blasint N, const double alpha, double *X, 
const blasint incX); -void cblas_sgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float alpha, float *a, blasint lda, float *x, blasint incx, float beta, float *y, blasint incy); -void cblas_dgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double alpha, double *a, blasint lda, double *x, blasint incx, double beta, double *y, blasint incy); -void cblas_cgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - float *alpha, float *a, blasint lda, float *x, blasint incx, float *beta, float *y, blasint incy); -void cblas_zgemv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE trans, blasint m, blasint n, - double *alpha, double *a, blasint lda, double *x, blasint incx, double *beta, double *y, blasint incy); +void cblas_sgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float beta, float *y, const blasint incy); +void cblas_dgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double beta, double *y, const blasint incy); +void cblas_cgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const float *alpha, const float *a, const blasint lda, const float *x, const blasint incx, const float *beta, float *y, const blasint incy); +void cblas_zgemv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE trans, const blasint m, const blasint n, + const double *alpha, const double *a, const blasint lda, const double *x, const blasint incx, const double *beta, double *y, const blasint incy); -void cblas_sger (enum CBLAS_ORDER order, blasint M, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void 
cblas_dger (enum CBLAS_ORDER order, blasint M, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cgeru(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_cgerc(enum CBLAS_ORDER order, blasint M, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_zgeru(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_zgerc(enum CBLAS_ORDER order, blasint M, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *A, blasint lda); +void cblas_sger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dger (const enum CBLAS_ORDER order, const blasint M, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_cgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zgeru(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_zgerc(const enum CBLAS_ORDER order, const blasint M, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); -void 
cblas_strsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_strmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_dtrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); -void cblas_ctrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, 
enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, float *A, blasint lda, float *X, blasint incX); -void cblas_ztrmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, blasint N, double *A, blasint lda, double *X, blasint incX); +void cblas_strmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztrmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, const blasint N, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_ssyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_dsyr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); -void cblas_cher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A, blasint lda); -void cblas_zher(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *A, blasint lda); +void cblas_ssyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_dsyr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const 
double alpha, const double *X, const blasint incX, double *A, const blasint lda); +void cblas_cher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A, const blasint lda); +void cblas_zher(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *A, const blasint lda); -void cblas_ssyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,blasint N, float alpha, float *X, - blasint incX, float *Y, blasint incY, float *A, blasint lda); -void cblas_dsyr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, - blasint incX, double *Y, blasint incY, double *A, blasint lda); -void cblas_cher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, - float *Y, blasint incY, float *A, blasint lda); -void cblas_zher2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, - double *Y, blasint incY, double *A, blasint lda); +void cblas_ssyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo,const blasint N, const float alpha, const float *X, + const blasint incX, const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_dsyr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, + const blasint incX, const double *Y, const blasint incY, double *A, const blasint lda); +void cblas_cher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *X, const blasint incX, + const float *Y, const blasint incY, float *A, const blasint lda); +void cblas_zher2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, + const double *Y, const blasint incY, double *A, const blasint lda); -void cblas_sgbmv(enum 
CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float alpha, float *A, blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double alpha, double *A, blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_cgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zgbmv(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, blasint M, blasint N, - blasint KL, blasint KU, double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_sgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_cgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zgbmv(const enum CBLAS_ORDER order, const enum CBLAS_TRANSPOSE TransA, const blasint M, const blasint N, + const blasint KL, const blasint KU, const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const 
double *beta, double *Y, const blasint incY); -void cblas_ssbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_ssbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_stbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, 
const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_dtbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); -void cblas_ctbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, float *A, blasint lda, float *X, blasint incX); -void cblas_ztbsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, blasint K, double *A, blasint lda, double *X, blasint incX); +void cblas_stbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_dtbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); +void cblas_ctbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, 
const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const float *A, const blasint lda, float *X, const blasint incX); +void cblas_ztbsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const blasint K, const double *A, const blasint lda, double *X, const blasint incX); -void cblas_stpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_dtpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_stpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, 
float *X, blasint incX); -void cblas_dtpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); -void cblas_ctpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, float *Ap, float *X, blasint incX); -void cblas_ztpsv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, enum CBLAS_DIAG Diag, - blasint N, double *Ap, double *X, blasint incX); +void cblas_stpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_dtpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); +void cblas_ctpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const float *Ap, float *X, const blasint incX); +void cblas_ztpsv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_DIAG Diag, + const blasint N, const double *Ap, double *X, const blasint incX); -void cblas_ssymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *A, - blasint lda, float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dsymv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *A, - blasint lda, double *X, blasint incX, double beta, double *Y, blasint incY); -void cblas_chemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *A, - blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhemv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *A, - blasint lda, 
double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_ssymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dsymv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double beta, double *Y, const blasint incY); +void cblas_chemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float *alpha, const float *A, + const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhemv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *A, + const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *Ap, - float *X, blasint incX, float beta, float *Y, blasint incY); -void cblas_dspmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *Ap, - double *X, blasint incX, double beta, double *Y, blasint incY); +void cblas_sspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *Ap, + const float *X, const blasint incX, const float beta, float *Y, const blasint incY); +void cblas_dspmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *Ap, + const double *X, const blasint incX, const double beta, double *Y, const blasint incY); -void cblas_sspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Ap); -void cblas_dspr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double 
*X, blasint incX, double *Ap); +void cblas_sspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *Ap); +void cblas_dspr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, double *Ap); -void cblas_chpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *A); -void cblas_zhpr(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X,blasint incX, double *A); +void cblas_chpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, float *A); +void cblas_zhpr(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X,const blasint incX, double *A); -void cblas_sspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float alpha, float *X, blasint incX, float *Y, blasint incY, float *A); -void cblas_dspr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double alpha, double *X, blasint incX, double *Y, blasint incY, double *A); -void cblas_chpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, float *alpha, float *X, blasint incX, float *Y, blasint incY, float *Ap); -void cblas_zhpr2(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, double *alpha, double *X, blasint incX, double *Y, blasint incY, double *Ap); +void cblas_sspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const float alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *A); +void cblas_dspr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *A); +void cblas_chpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint 
N, const float *alpha, const float *X, const blasint incX, const float *Y, const blasint incY, float *Ap); +void cblas_zhpr2(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const double *alpha, const double *X, const blasint incX, const double *Y, const blasint incY, double *Ap); -void cblas_chbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhbmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhbmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_chpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - float *alpha, float *Ap, float *X, blasint incX, float *beta, float *Y, blasint incY); -void cblas_zhpmv(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo, blasint N, - double *alpha, double *Ap, double *X, blasint incX, double *beta, double *Y, blasint incY); +void cblas_chpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const float *alpha, const float *Ap, const float *X, const blasint incX, const float *beta, float *Y, const blasint incY); +void cblas_zhpmv(const enum CBLAS_ORDER order, const enum CBLAS_UPLO Uplo, const blasint N, + const double *alpha, const double *Ap, const double *X, const blasint incX, const double *beta, double *Y, const blasint incY); -void cblas_sgemm(enum 
CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_cgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zgemm(enum CBLAS_ORDER Order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint M, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_sgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_cgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zgemm(const enum CBLAS_ORDER Order, const enum CBLAS_TRANSPOSE TransA, const enum CBLAS_TRANSPOSE TransB, const blasint M, const blasint N, const 
blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsymm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsymm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const 
double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_ssyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_dsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); -void cblas_csyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *beta, float *C, blasint ldc); -void cblas_zsyrk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *beta, double *C, blasint ldc); +void cblas_ssyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_dsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); +void cblas_csyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *beta, float *C, const blasint ldc); +void cblas_zsyrk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *beta, double *C, const blasint ldc); -void cblas_ssyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint 
K, float alpha, float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_dsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); -void cblas_csyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zsyr2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, - blasint N, blasint K, double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_ssyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_dsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); +void cblas_csyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zsyr2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, + const blasint N, const blasint K, const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_strmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum 
CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrmm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrmm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint ldb); -void cblas_strsm(enum CBLAS_ORDER Order, 
enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_dtrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double alpha, double *A, blasint lda, double *B, blasint ldb); -void cblas_ctrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, float *alpha, float *A, blasint lda, float *B, blasint ldb); -void cblas_ztrsm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE TransA, - enum CBLAS_DIAG Diag, blasint M, blasint N, double *alpha, double *A, blasint lda, double *B, blasint ldb); +void cblas_strsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_dtrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double alpha, const double *A, const blasint lda, double *B, const blasint ldb); +void cblas_ctrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const float *alpha, const float *A, const blasint lda, float *B, const blasint ldb); +void cblas_ztrsm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE TransA, + const enum CBLAS_DIAG Diag, const blasint M, const blasint N, const double *alpha, const double *A, const blasint lda, double *B, const blasint 
ldb); -void cblas_chemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - float *alpha, float *A, blasint lda, float *B, blasint ldb, float *beta, float *C, blasint ldc); -void cblas_zhemm(enum CBLAS_ORDER Order, enum CBLAS_SIDE Side, enum CBLAS_UPLO Uplo, blasint M, blasint N, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double *beta, double *C, blasint ldc); +void cblas_chemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float *beta, float *C, const blasint ldc); +void cblas_zhemm(const enum CBLAS_ORDER Order, const enum CBLAS_SIDE Side, const enum CBLAS_UPLO Uplo, const blasint M, const blasint N, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double *beta, double *C, const blasint ldc); -void cblas_cherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float alpha, float *A, blasint lda, float beta, float *C, blasint ldc); -void cblas_zherk(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double alpha, double *A, blasint lda, double beta, double *C, blasint ldc); +void cblas_cherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float alpha, const float *A, const blasint lda, const float beta, float *C, const blasint ldc); +void cblas_zherk(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double alpha, const double *A, const blasint lda, const double beta, double *C, const blasint ldc); -void cblas_cher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - float *alpha, 
float *A, blasint lda, float *B, blasint ldb, float beta, float *C, blasint ldc); -void cblas_zher2k(enum CBLAS_ORDER Order, enum CBLAS_UPLO Uplo, enum CBLAS_TRANSPOSE Trans, blasint N, blasint K, - double *alpha, double *A, blasint lda, double *B, blasint ldb, double beta, double *C, blasint ldc); +void cblas_cher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const float *alpha, const float *A, const blasint lda, const float *B, const blasint ldb, const float beta, float *C, const blasint ldc); +void cblas_zher2k(const enum CBLAS_ORDER Order, const enum CBLAS_UPLO Uplo, const enum CBLAS_TRANSPOSE Trans, const blasint N, const blasint K, + const double *alpha, const double *A, const blasint lda, const double *B, const blasint ldb, const double beta, double *C, const blasint ldc); void cblas_xerbla(blasint p, char *rout, char *form, ...); #ifdef __cplusplus } - #endif /* __cplusplus */ #endif diff --git a/common.h b/common.h index 003fde77f..4403af13d 100644 --- a/common.h +++ b/common.h @@ -557,7 +557,8 @@ typedef struct { #include "common_level3.h" #include "common_lapack.h" #ifdef CBLAS -#include "cblas.h" +/* This header file is generated from "cblas.h" (see Makefile.prebuild). */ +#include "cblas_noconst.h" #endif #ifndef ASSEMBLER From 8cdb79543823f1da894e18c6487a8d4d9cfdb1c3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Jan 2013 00:18:21 +0800 Subject: [PATCH 19/30] Refs #187. Use binary code for xgetbv, which is compatible with old compiler. 
--- cpuid_x86.c | 3 ++- driver/others/dynamic.c | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index b3352244b..a19dedeee 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -116,8 +116,9 @@ static inline int have_excpuid(void){ #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 28fdd30d8..b6f27d0ad 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -78,8 +78,9 @@ extern gotoblas_t gotoblas_SANDYBRIDGE; #ifndef NO_AVX static inline void xgetbv(int op, int * eax, int * edx){ + //Use binary code for xgetbv __asm__ __volatile__ - ("xgetbv": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); + (".byte 0x0f, 0x01, 0xd0": "=a" (*eax), "=d" (*edx) : "c" (op) : "cc"); } #endif From 36e098296684b264f8b0979268e775519be1c81e Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 22 Jan 2013 00:29:54 +0800 Subject: [PATCH 20/30] Refs #187. Use perl to generate cblas_noconst.h instead of sed. Thank Dan Povey's patch. 
https://github.com/xianyi/OpenBLAS/issues/187 --- Makefile.prebuild | 2 +- getarch_2nd.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.prebuild b/Makefile.prebuild index f4b0bb5af..c7d0de70e 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -39,6 +39,6 @@ else endif cblas_noconst.h : cblas.h - sed -e "s/\bconst\b\s*//g" cblas.h > cblas_noconst.h + perl -ane ' s/\bconst\b\s*//g; print; ' < cblas.h > cblas_noconst.h dummy: diff --git a/getarch_2nd.c b/getarch_2nd.c index 5339af442..4bdd16a99 100644 --- a/getarch_2nd.c +++ b/getarch_2nd.c @@ -34,7 +34,7 @@ int main(int argc, char **argv) { #ifdef USE64BITINT printf("#define USE64BITINT\n"); #endif - printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", GEMM_MULTITHREAD_THRESHOLD); + printf("#define GEMM_MULTITHREAD_THRESHOLD\t%ld\n", (long int)GEMM_MULTITHREAD_THRESHOLD); } return 0; From d311236dfdefa41f31a2e7fefa548abf47f0461c Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 25 Jan 2013 16:18:27 +0800 Subject: [PATCH 21/30] Refs #189. Fixed the bug of s/cdot about invalid reading NAN on x86_64. 
--- kernel/x86_64/dot_sse.S | 7 ++++--- kernel/x86_64/zdot_sse.S | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/kernel/x86_64/dot_sse.S b/kernel/x86_64/dot_sse.S index 61c481064..985ce9fec 100644 --- a/kernel/x86_64/dot_sse.S +++ b/kernel/x86_64/dot_sse.S @@ -530,7 +530,7 @@ #endif movsd -32 * SIZE(Y), %xmm8 - pshufd $0x39, %xmm4, %xmm5 + pshufd $0x29, %xmm4, %xmm5 mulps %xmm8, %xmm5 addps %xmm5, %xmm3 @@ -750,7 +750,8 @@ xorps %xmm5, %xmm5 movhlps %xmm4, %xmm5 - mulps -32 * SIZE(Y), %xmm5 + movlps -32 * SIZE(Y), %xmm4 + mulps %xmm4, %xmm5 addps %xmm5, %xmm0 addq $2 * SIZE, X @@ -992,7 +993,7 @@ movsd -32 * SIZE(Y), %xmm8 movss %xmm5, %xmm4 - shufps $0x93, %xmm5, %xmm4 + shufps $0x93, %xmm4, %xmm4 mulps %xmm8, %xmm4 addps %xmm4, %xmm3 diff --git a/kernel/x86_64/zdot_sse.S b/kernel/x86_64/zdot_sse.S index 13804e0f8..e2f153ab3 100644 --- a/kernel/x86_64/zdot_sse.S +++ b/kernel/x86_64/zdot_sse.S @@ -699,7 +699,7 @@ movsd -32 * SIZE(X), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0x59, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1336,7 +1336,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -1697,7 +1697,7 @@ movsd -32 * SIZE(Y), %xmm4 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x39, %xmm8, %xmm8 + shufps $0xa9, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 @@ -2024,7 +2024,7 @@ movss %xmm9, %xmm8 pshufd $0xb1, %xmm4, %xmm12 - shufps $0x93, %xmm8, %xmm8 + shufps $0x03, %xmm8, %xmm8 mulps %xmm8, %xmm4 addps %xmm4, %xmm0 mulps %xmm8, %xmm12 From 875d520ccfcfbb6a77cc5166b8bd562c3d111718 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Thu, 31 Jan 2013 08:48:27 +0100 Subject: [PATCH 22/30] Refs #193. cblas: move #include out of extern "C" block. Standard headers may contain C++ templates which are not permitted inside an extern "C" block. 
This might be the case when we include <complex.h>. --- cblas.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cblas.h b/cblas.h index 501e7d0d1..6684262e2 100644 --- a/cblas.h +++ b/cblas.h @@ -1,14 +1,14 @@ #ifndef CBLAS_H #define CBLAS_H +#include <stddef.h> +#include "common.h" + #ifdef __cplusplus extern "C" { /* Assume C declarations for C++ */ #endif /* __cplusplus */ -#include <stddef.h> -#include "common.h" - /*Set the number of threads on runtime.*/ void openblas_set_num_threads(int num_threads); void goto_set_num_threads(int num_threads); From 64ad8b9809e3768981d540d6f674a4642b86bf8d Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Fri, 1 Feb 2013 09:24:44 +0100 Subject: [PATCH 23/30] Refs #193. Don't use C99 complex numbers when building C++ code. --- common.h | 3 ++- openblas_config_template.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/common.h b/common.h index 4403af13d..a822b7182 100644 --- a/common.h +++ b/common.h @@ -390,7 +390,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; typedef double _Complex openblas_complex_double; diff --git a/openblas_config_template.h b/openblas_config_template.h index a2b05696f..0d1186819 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -48,7 +48,8 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard).
*/ -#if defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || __GNUC__ >= 3 +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || + (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 #include <complex.h> typedef float _Complex openblas_complex_float; From a9500d00793bc8a63939bfb634f46d4b1654a2ec Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Fri, 1 Feb 2013 09:34:12 +0100 Subject: [PATCH 24/30] Missing line continuation -- follow-up to last commit (64ad8b9809). --- common.h | 2 +- openblas_config_template.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/common.h b/common.h index a822b7182..d46a5230a 100644 --- a/common.h +++ b/common.h @@ -390,7 +390,7 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard). */ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 typedef float _Complex openblas_complex_float; diff --git a/openblas_config_template.h b/openblas_config_template.h index 0d1186819..cf2c037cc 100644 --- a/openblas_config_template.h +++ b/openblas_config_template.h @@ -48,7 +48,7 @@ typedef int blasint; /* C99 supports complex floating numbers natively, which GCC also offers as an extension since version 3.0. If neither are available, use a compatible structure as fallback (see Clause 6.2.5.13 of the C99 standard).
*/ -#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || +#if (defined(__STDC_IEC_559_COMPLEX__) || __STDC_VERSION__ >= 199901L || \ (__GNUC__ >= 3 && !defined(__cplusplus))) #define OPENBLAS_COMPLEX_C99 #include <complex.h> From 5155e3f5090aa313ce342f4bc0880db63208c5a5 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 13 Feb 2013 16:05:58 +0800 Subject: [PATCH 25/30] Refs #174. Fixed the overflowing buffer bug of multithreading hbmv and sbmv. Instead of using thread 0 buffer, each thread uses its own sb buffer. Thus, it can avoid overflowing thread 0 buffer. --- driver/level2/sbmv_thread.c | 10 ++++++---- driver/others/blas_server.c | 1 + 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/driver/level2/sbmv_thread.c b/driver/level2/sbmv_thread.c index 222734d5e..7dfabfa81 100644 --- a/driver/level2/sbmv_thread.c +++ b/driver/level2/sbmv_thread.c @@ -65,7 +65,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a = (FLOAT *)args -> a; x = (FLOAT *)args -> b; - y = (FLOAT *)args -> c; lda = args -> lda; incx = args -> ldb; @@ -76,6 +75,10 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F n_from = 0; n_to = n; + //Use y as each thread's n* COMPSIZE elements in sb buffer + y = buffer; + buffer += ((COMPSIZE * n + 1023) & ~1023); + if (range_m) { n_from = *(range_m + 0); n_to = *(range_m + 1); @@ -83,7 +86,6 @@ static int sbmv_kernel(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, F a += n_from * lda * COMPSIZE; } - if (range_n) y += *range_n * COMPSIZE; if (incx != 1) { COPY_K(n, x, incx, buffer, 1); @@ -331,7 +333,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a, BLASLONG lda, FLOAT *x if (num_cpu) { queue[0].sa = NULL; - queue[0].sb = buffer + num_cpu * (((n + 255) & ~255) + 16) * COMPSIZE; + queue[0].sb = buffer; queue[num_cpu - 1].next = NULL; exec_blas(num_cpu, queue); @@ -344,7 +346,7 @@ int CNAME(BLASLONG n, BLASLONG k, FLOAT *alpha, FLOAT *a,
BLASLONG lda, FLOAT *x #else ONE, ZERO, #endif - buffer + range_n[i] * COMPSIZE, 1, buffer, 1, NULL, 0); + (FLOAT*)(queue[i].sb), 1, buffer, 1, NULL, 0); } AXPYU_K(n, 0, 0, diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index c51e681a5..2afcb742e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -385,6 +385,7 @@ static int blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR From 4c2123c3343c523d5359853ac0ebe1bd6550a881 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 23 Feb 2013 12:51:13 +0800 Subject: [PATCH 26/30] Fixed the overflowing bug in single thread cholesky factorization. --- lapack/potrf/potrf_L_single.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/lapack/potrf/potrf_L_single.c b/lapack/potrf/potrf_L_single.c index b88f8fc7a..d6d143623 100644 --- a/lapack/potrf/potrf_L_single.c +++ b/lapack/potrf/potrf_L_single.c @@ -66,7 +66,9 @@ static FLOAT dm1 = -1.; #endif #define GEMM_PQ MAX(GEMM_P, GEMM_Q) -#define REAL_GEMM_R (GEMM_R - GEMM_PQ) + +//leave some space for GEMM_ALIGN in sb2 +#define REAL_GEMM_R (GEMM_R - 2*GEMM_PQ) #if 0 #define SHARED_ARRAY @@ -220,7 +222,7 @@ blasint CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, FLOAT *sa, sa, sb2, a + (is + js * lda) * COMPSIZE, lda, - - is + js); + is - js); #endif } From 3cc6ae793eb9deba2d9b94e1326d2bc8b155f2f6 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Tue, 26 Feb 2013 00:48:21 +0800 Subject: [PATCH 27/30] Refs #174. Return sb pointer when OpenMP or Windows. 
--- driver/others/blas_server_omp.c | 1 + driver/others/blas_server_win32.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index c45856fd9..21bc5f78e 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -224,6 +224,7 @@ static void exec_threads(blas_queue_t *queue){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } } diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 9cbd7e219..bd1069c5e 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -253,6 +253,7 @@ static DWORD WINAPI blas_thread_server(void *arg){ + GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B); } } + queue->sb=sb; } #ifdef MONITOR @@ -495,4 +496,4 @@ void goto_set_num_threads(int num_threads) void openblas_set_num_threads(int num) { goto_set_num_threads(num); -} \ No newline at end of file +} From d744c9590ae18706b40e151a9adf2070639909fb Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Fri, 1 Mar 2013 14:36:47 +0800 Subject: [PATCH 28/30] In OpenMP threading, preallocate the thread buffer instead of allocating the buffer every time. This patch improved the performance slightly. 
--- driver/others/blas_server_omp.c | 50 +++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index 21bc5f78e..c567ed688 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -49,8 +49,12 @@ int blas_server_avail = 0; +static void * blas_thread_buffer[MAX_CPU_NUMBER]; + void goto_set_num_threads(int num_threads) { + int i=0; + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -62,7 +66,19 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; omp_set_num_threads(blas_cpu_number); - + + //adjust buffer for each thread + for(i=0; i sa; sb = queue -> sb; @@ -189,7 +222,14 @@ static void exec_threads(blas_queue_t *queue){ if ((sa == NULL) && (sb == NULL) && ((queue -> mode & BLAS_PTHREAD) == 0)) { - buffer = blas_memory_alloc(2); + pos = omp_get_thread_num(); + buffer = blas_thread_buffer[pos]; + + //fallback + if(buffer==NULL) { + buffer = blas_memory_alloc(2); + release_flag=1; + } if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A); @@ -242,7 +282,7 @@ static void exec_threads(blas_queue_t *queue){ } - if (buffer != NULL) blas_memory_free(buffer); + if (release_flag) blas_memory_free(buffer); } From f1ce74ffdda640d31a58dd0b867e959672444811 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Mar 2013 14:15:54 +0800 Subject: [PATCH 29/30] Improved the print when OS don't support AVX. --- driver/others/dynamic.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/driver/others/dynamic.c b/driver/others/dynamic.c index 6523abb4d..893dd0738 100644 --- a/driver/others/dynamic.c +++ b/driver/others/dynamic.c @@ -175,7 +175,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. 
Use Nehalem kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } @@ -186,7 +186,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_SANDYBRIDGE; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Nehalem kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Nehalem kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_NEHALEM; //OS doesn't support AVX. Use old kernels. } } @@ -211,7 +211,7 @@ static gotoblas_t *get_coretype(void){ if(support_avx()) return &gotoblas_BULLDOZER; else{ - fprintf(stderr, "OpenBLAS : Your OS doesn't support AVX. Use Barcelona kernels.\n"); + fprintf(stderr, "OpenBLAS : Your OS does not support AVX instructions. OpenBLAS is using Barcelona kernels as a fallback, which may give poorer performance.\n"); return &gotoblas_BARCELONA; //OS doesn't support AVX. Use old kernels. } } else { From 0d0405b434808d8c8122474a4dcfa089f4962512 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Mar 2013 14:22:27 +0800 Subject: [PATCH 30/30] Updated the doc for 0.2.6 version. --- Changelog.txt | 18 ++++++++++++++++++ Makefile.rule | 2 +- README.md | 2 +- 3 files changed, 20 insertions(+), 2 deletions(-) diff --git a/Changelog.txt b/Changelog.txt index db0732c4f..54b11ad81 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,22 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.2.6 +2-Mar-2013 +common: + * Improved OpenMP performance slightly. (d744c9) + * Improved cblas.h compatibility with Intel MKL.(#185) + * Fixed the overflowing bug in single thread cholesky factorization. 
+ * Fixed the overflowing buffer bug of multithreading hbmv and sbmv.(#174) + +x86/x86-64: + * Added AMD Bulldozer x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) + We will tune the performance in future. + * Auto-detect Intel Xeon E7540. + * Fixed the overflowing buffer bug of gemv. (#173) + * Fixed the bug of s/cdot about invalid reading NAN on x86_64. (#189) + +MIPS64: + ==================================================================== Version 0.2.5 26-Nov-2012 diff --git a/Makefile.rule b/Makefile.rule index 1240ab0ad..4e238575a 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.2.5 +VERSION = 0.2.6 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/README.md b/README.md index ce2688f03..ed5f196c7 100644 --- a/README.md +++ b/README.md @@ -44,7 +44,7 @@ Please read GotoBLAS_01Readme.txt - **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes. - **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64. - **AMD Bobcat**: Used GotoBLAS2 Barcelona codes. -- **AMD Bulldozer**: Used GotoBLAS2 Barcelona codes. +- **AMD Bulldozer**: x86-64 S/DGEMM AVX kernels. (Thank Werner Saar) #### MIPS64: - **ICT Loongson 3A**: Optimized Level-3 BLAS and the part of Level-1,2.