From 19a48b82cf3c4aa25659ea89dce494e2d78fed25 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 30 Mar 2012 20:01:03 +0800 Subject: [PATCH 01/46] Init Sandybridge codes based on Nehalem. --- Makefile.system | 5 +- TargetList.txt | 2 + cpuid.h | 3 + cpuid_x86.c | 11 +++- driver/others/parameter.c | 13 ++++- getarch.c | 14 +++++ kernel/setparam-ref.c | 16 ++++++ kernel/x86/KERNEL.SANDYBRIDGE | 1 + kernel/x86/gemm_kernel_2x4_penryn.S | 6 ++ kernel/x86/gemm_kernel_4x4_penryn.S | 8 ++- kernel/x86/gemv_n_sse.S | 2 +- kernel/x86/gemv_n_sse2.S | 2 +- kernel/x86/gemv_t_sse.S | 2 +- kernel/x86/gemv_t_sse2.S | 2 +- kernel/x86/trsm_kernel_LN_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LN_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_LT_4x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_2x4_penryn.S | 2 +- kernel/x86/trsm_kernel_RT_4x4_penryn.S | 2 +- kernel/x86/zgemm_kernel_1x2_penryn.S | 2 +- kernel/x86/zgemm_kernel_2x2_penryn.S | 2 +- kernel/x86/zgemv_n_sse.S | 2 +- kernel/x86/zgemv_n_sse2.S | 2 +- kernel/x86/zgemv_t_sse.S | 2 +- kernel/x86/zgemv_t_sse2.S | 2 +- kernel/x86/zscal_sse.S | 4 +- kernel/x86/zscal_sse2.S | 4 +- kernel/x86/ztrsm_kernel_LN_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_LT_2x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_1x2_penryn.S | 2 +- kernel/x86/ztrsm_kernel_RT_2x2_penryn.S | 2 +- kernel/x86_64/KERNEL.SANDYBRIDGE | 59 ++++++++++++++++++++ kernel/x86_64/dgemm_ncopy_2.S | 6 ++ kernel/x86_64/dgemm_ncopy_4.S | 2 +- kernel/x86_64/dgemm_ncopy_8.S | 6 ++ kernel/x86_64/dgemm_tcopy_2.S | 7 +++ kernel/x86_64/dgemm_tcopy_4.S | 6 ++ kernel/x86_64/dgemm_tcopy_8.S | 7 +++ kernel/x86_64/gemm_ncopy_2.S | 7 +++ kernel/x86_64/gemm_ncopy_4.S | 2 +- kernel/x86_64/gemm_tcopy_2.S | 7 +++ kernel/x86_64/gemm_tcopy_4.S | 2 +- kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zscal_sse.S | 2 +- kernel/x86_64/zscal_sse2.S | 4 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 7 +++ l2param.h | 11 ++++ param.h | 74 +++++++++++++++++++++++++ 57 files changed, 309 insertions(+), 45 deletions(-) create mode 100644 kernel/x86/KERNEL.SANDYBRIDGE create mode 100644 kernel/x86_64/KERNEL.SANDYBRIDGE diff --git a/Makefile.system b/Makefile.system index 0fd223d60..7c6dce4a5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -226,11 +226,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO endif ifndef DYNAMIC_CORE @@ -740,6 +740,7 @@ export HAVE_SSE4_1 export HAVE_SSE4_2 export HAVE_SSE4A export HAVE_SSE5 +export HAVE_AVX export KERNELDIR export FUNCTION_PROFILE export TARGET_CORE diff --git a/TargetList.txt b/TargetList.txt index 1c3d7c5b9..9e0db4866 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -18,6 +18,7 @@ CORE2 PENRYN DUNNINGTON NEHALEM +SANDYBRIDGE ATOM b)AMD CPU: @@ -47,6 +48,7 @@ CELL 3.MIPS64 CPU: SICORTEX LOONGSON3A +LOONGSON3B 4.IA64 
CPU: ITANIUM2 diff --git a/cpuid.h b/cpuid.h index 665ede077..c0f21698d 100644 --- a/cpuid.h +++ b/cpuid.h @@ -103,6 +103,7 @@ #define CORE_NEHALEM 17 #define CORE_ATOM 18 #define CORE_NANO 19 +#define CORE_SANDYBRIDGE 20 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -122,6 +123,7 @@ #define HAVE_MISALIGNSSE (1 << 15) #define HAVE_128BITFPU (1 << 16) #define HAVE_FASTMOVU (1 << 17) +#define HAVE_AVX (1 << 18) #define CACHE_INFO_L1_I 1 #define CACHE_INFO_L1_D 2 @@ -188,4 +190,5 @@ typedef struct { #define CPUTYPE_NSGEODE 41 #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 +#define CPUTYPE_SANDYBRIDGE 44 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index e183e9fc3..9916a662b 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -189,6 +189,7 @@ int get_cputype(int gettype){ if ((ecx & (1 << 9)) != 0) feature |= HAVE_SSSE3; if ((ecx & (1 << 19)) != 0) feature |= HAVE_SSE4_1; if ((ecx & (1 << 20)) != 0) feature |= HAVE_SSE4_2; + if ((ecx & (1 << 28)) != 0) feature |= HAVE_AVX; if (have_excpuid() >= 0x01) { cpuid(0x80000001, &eax, &ebx, &ecx, &edx); @@ -983,7 +984,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CPUTYPE_NEHALEM; @@ -1140,6 +1141,7 @@ static char *cpuname[] = { "NSGEODE", "VIAC3", "NANO", + "SANDYBRIDGE", }; static char *lowercpuname[] = { @@ -1186,6 +1188,7 @@ static char *lowercpuname[] = { "tms3x00", "nsgeode", "nano", + "sandybridge", }; static char *corename[] = { @@ -1209,6 +1212,7 @@ static char *corename[] = { "NEHALEM", "ATOM", "NANO", + "SANDYBRIDGE", }; static char *corename_lower[] = { @@ -1232,6 +1236,7 @@ static char *corename_lower[] = { "nehalem", "atom", "nano", + "sandybridge", }; @@ -1315,7 +1320,7 @@ int get_coretype(void){ return CORE_NEHALEM; case 10: //Intel Core i5-2000 /i7-2000 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 12: //Xeon Processor 5600 (Westmere-EP) return CORE_NEHALEM; @@ -1414,6 +1419,7 @@ void get_cpuconfig(void){ if (features & HAVE_SSE4_2) printf("#define HAVE_SSE4_2\n"); if (features & HAVE_SSE4A) printf("#define HAVE_SSE4A\n"); if (features & HAVE_SSE5 ) printf("#define HAVE_SSSE5\n"); + if (features & HAVE_AVX ) printf("#define HAVE_AVX\n"); if (features & HAVE_3DNOWEX) printf("#define HAVE_3DNOWEX\n"); if (features & HAVE_3DNOW) printf("#define HAVE_3DNOW\n"); if (features & HAVE_CFLUSH) printf("#define HAVE_CFLUSH\n"); @@ -1479,6 +1485,7 @@ void get_sse(void){ if (features & HAVE_SSE4_2) printf("HAVE_SSE4_2=1\n"); if (features & HAVE_SSE4A) printf("HAVE_SSE4A=1\n"); if (features & HAVE_SSE5 ) printf("HAVE_SSSE5=1\n"); + if (features & HAVE_AVX ) printf("HAVE_AVX=1\n"); if (features & HAVE_3DNOWEX) printf("HAVE_3DNOWEX=1\n"); if (features & HAVE_3DNOW) printf("HAVE_3DNOW=1\n"); diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 21f56e889..5ff1f2934 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -165,7 +165,7 @@ int get_L2_size(void){ #if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ - defined(CORE_NEHALEM) || defined(ATOM) || defined(GENERIC) + defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) cpuid(0x80000006, &eax, &ebx, &ecx, &edx); @@ -384,6 +384,17 @@ void blas_set_parameter(void){ #endif #endif +#if defined(SANDYBRIDGE) + sgemm_p = 1024; + 
dgemm_p = 512; + cgemm_p = 512; + zgemm_p = 256; +#ifdef EXPRECISION + qgemm_p = 256; + xgemm_p = 128; +#endif +#endif + #if defined(CORE_PRESCOTT) || defined(GENERIC) size >>= 6; diff --git a/getarch.c b/getarch.c index 5b614472a..d8f467f03 100644 --- a/getarch.c +++ b/getarch.c @@ -278,6 +278,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "NEHALEM" #endif +#ifdef FORCE_SANDYBRIDGE +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "SANDYBRIDGE" +#define ARCHCONFIG "-DSANDYBRIDGE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 " \ + "-DHAVE_CMOV -DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 -DHAVE_SSE4_1 -DHAVE_SSE4_2 -DHAVE_AVX" +#define LIBNAME "sandybridge" +#define CORENAME "SANDYBRIDGE" +#endif + #ifdef FORCE_ATOM #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index aa45d47f8..e841bb171 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -746,6 +746,22 @@ static void init_parameter(void) { #endif #endif +#ifdef SANDYBRIDGE + +#ifdef DEBUG + fprintf(stderr, "Sandybridge\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef OPTERON #ifdef DEBUG diff --git a/kernel/x86/KERNEL.SANDYBRIDGE b/kernel/x86/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..65b03ae50 --- /dev/null +++ b/kernel/x86/KERNEL.SANDYBRIDGE @@ -0,0 +1 @@ +include $(KERNELDIR)/KERNEL.PENRYN diff --git a/kernel/x86/gemm_kernel_2x4_penryn.S b/kernel/x86/gemm_kernel_2x4_penryn.S index 263aea042..0bdc9185c 100644 --- a/kernel/x86/gemm_kernel_2x4_penryn.S +++ b/kernel/x86/gemm_kernel_2x4_penryn.S @@ -76,6 +76,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (8 * 1 - 4) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif diff --git a/kernel/x86/gemm_kernel_4x4_penryn.S b/kernel/x86/gemm_kernel_4x4_penryn.S index 6775d1d18..2d51d9711 100644 --- a/kernel/x86/gemm_kernel_4x4_penryn.S +++ b/kernel/x86/gemm_kernel_4x4_penryn.S @@ -69,6 +69,12 @@ #define PREFETCHB prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE (16 * 1 - 8) +#define PREFETCHW prefetcht0 +#define PREFETCHB prefetcht0 +#endif + #ifndef PREFETCH #define PREFETCH prefetcht0 #endif @@ -262,7 +268,7 @@ movaps -16 * SIZE(AA), %xmm0 addps %xmm2, %xmm7 -#ifndef NEHALEM +#if !(defined(NEHALEM) || defined(SANDYBRIDGE)) PREFETCH (PREFETCHSIZE + 16) * SIZE(AA) #endif pshufd $0x93, %xmm1, %xmm2 diff --git a/kernel/x86/gemv_n_sse.S b/kernel/x86/gemv_n_sse.S index aae49a22d..0891657fa 100644 --- a/kernel/x86/gemv_n_sse.S +++ b/kernel/x86/gemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_n_sse2.S b/kernel/x86/gemv_n_sse2.S index 669c5ac6c..5f5fa5a51 100644 --- a/kernel/x86/gemv_n_sse2.S +++ b/kernel/x86/gemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE 
(8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/gemv_t_sse.S b/kernel/x86/gemv_t_sse.S index a4990116d..5bacb7da8 100644 --- a/kernel/x86/gemv_t_sse.S +++ b/kernel/x86/gemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 4) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/gemv_t_sse2.S b/kernel/x86/gemv_t_sse2.S index 9960b5c0c..c7e685dd8 100644 --- a/kernel/x86/gemv_t_sse2.S +++ b/kernel/x86/gemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/trsm_kernel_LN_2x4_penryn.S b/kernel/x86/trsm_kernel_LN_2x4_penryn.S index 6645b790e..ebd1377f1 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LN_4x4_penryn.S b/kernel/x86/trsm_kernel_LN_4x4_penryn.S index bb33918ef..6fa7d410e 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LN_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_2x4_penryn.S b/kernel/x86/trsm_kernel_LT_2x4_penryn.S index 55c69e49f..9ce4cd8d4 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_LT_4x4_penryn.S b/kernel/x86/trsm_kernel_LT_4x4_penryn.S index 11cc104e2..a1a35a7a5 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_LT_4x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_2x4_penryn.S b/kernel/x86/trsm_kernel_RT_2x4_penryn.S index 01876a515..a5333640d 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_2x4_penryn.S @@ -62,7 +62,7 @@ #define PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/trsm_kernel_RT_4x4_penryn.S b/kernel/x86/trsm_kernel_RT_4x4_penryn.S index 40a9604d3..c3619ec3d 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_penryn.S +++ b/kernel/x86/trsm_kernel_RT_4x4_penryn.S @@ -62,7 +62,7 @@ #define 
PREFETCHSIZE (8 * 21 + 4) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHSIZE (8 * 21 + 4) #endif diff --git a/kernel/x86/zgemm_kernel_1x2_penryn.S b/kernel/x86/zgemm_kernel_1x2_penryn.S index 849361956..70b38dc79 100644 --- a/kernel/x86/zgemm_kernel_1x2_penryn.S +++ b/kernel/x86/zgemm_kernel_1x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (8 * 1 - 4) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemm_kernel_2x2_penryn.S b/kernel/x86/zgemm_kernel_2x2_penryn.S index edd89b112..715eb4d4f 100644 --- a/kernel/x86/zgemm_kernel_2x2_penryn.S +++ b/kernel/x86/zgemm_kernel_2x2_penryn.S @@ -64,7 +64,7 @@ #define PREFETCHB prefetcht0 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE (16 * 1 + 8) #define PREFETCHW prefetcht0 #define PREFETCHB prefetcht0 diff --git a/kernel/x86/zgemv_n_sse.S b/kernel/x86/zgemv_n_sse.S index 340b9d375..8e28bb8e6 100644 --- a/kernel/x86/zgemv_n_sse.S +++ b/kernel/x86/zgemv_n_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_n_sse2.S b/kernel/x86/zgemv_n_sse2.S index 441fbb0c0..607c51de0 100644 --- a/kernel/x86/zgemv_n_sse2.S +++ b/kernel/x86/zgemv_n_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zgemv_t_sse.S b/kernel/x86/zgemv_t_sse.S index 4312ed173..fb98226ee 100644 --- a/kernel/x86/zgemv_t_sse.S +++ b/kernel/x86/zgemv_t_sse.S @@ -58,7 +58,7 @@ #define PREFETCHSIZE (16 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 7) diff --git a/kernel/x86/zgemv_t_sse2.S b/kernel/x86/zgemv_t_sse2.S index 78ca14cab..e2f391a82 100644 --- a/kernel/x86/zgemv_t_sse2.S +++ b/kernel/x86/zgemv_t_sse2.S @@ -45,7 +45,7 @@ #define PREFETCHSIZE (8 * 2) #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (8 * 7) diff --git a/kernel/x86/zscal_sse.S b/kernel/x86/zscal_sse.S index 849d787f6..53abb697b 100644 --- a/kernel/x86/zscal_sse.S +++ b/kernel/x86/zscal_sse.S @@ -55,7 +55,7 @@ #define XX %edi #define FLAG %ebp -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -697,7 +697,7 @@ cmpl $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) 
PSHUFD2($0, %xmm0, %xmm6) PSHUFD2($0, %xmm1, %xmm1) diff --git a/kernel/x86/zscal_sse2.S b/kernel/x86/zscal_sse2.S index 5b1da61e6..26ef693a0 100644 --- a/kernel/x86/zscal_sse2.S +++ b/kernel/x86/zscal_sse2.S @@ -57,7 +57,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -860,7 +860,7 @@ cmpl $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) #ifdef HAVE_SSE3 movddup %xmm0, %xmm6 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S index e5949aa6e..53e53c3ce 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S index b01498f78..3c056cdff 100644 --- a/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S index 3668ee2bb..1efa1fd25 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S index 13064166f..849afed73 100644 --- a/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_1x2_penryn.S @@ -63,7 +63,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S index ebff425c0..c1833abe2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_penryn.S @@ -61,7 +61,7 @@ #define PREFETCHSIZE 84 #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht1 #define PREFETCHSIZE 84 #endif diff --git a/kernel/x86_64/KERNEL.SANDYBRIDGE b/kernel/x86_64/KERNEL.SANDYBRIDGE new file mode 100644 index 000000000..58a883243 --- /dev/null +++ b/kernel/x86_64/KERNEL.SANDYBRIDGE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x8_nehalem.S +SGEMMINCOPY = gemm_ncopy_4.S +SGEMMITCOPY = gemm_tcopy_4.S +SGEMMONCOPY = ../generic/gemm_ncopy_8.c +SGEMMOTCOPY = ../generic/gemm_tcopy_8.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x8_nehalem.S +DGEMMINCOPY = dgemm_ncopy_2.S +DGEMMITCOPY = dgemm_tcopy_2.S +DGEMMONCOPY = dgemm_ncopy_8.S +DGEMMOTCOPY = dgemm_tcopy_8.S +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = 
dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x4_nehalem.S +CGEMMINCOPY = zgemm_ncopy_2.S +CGEMMITCOPY = zgemm_tcopy_2.S +CGEMMONCOPY = ../generic/zgemm_ncopy_4.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x4_nehalem.S +ZGEMMINCOPY = zgemm_ncopy_1.S +ZGEMMITCOPY = zgemm_tcopy_1.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x8_nehalem.S +STRSMKERNEL_LT = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RN = trsm_kernel_LT_4x8_nehalem.S +STRSMKERNEL_RT = trsm_kernel_RT_4x8_nehalem.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x8_nehalem.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x8_nehalem.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x8_nehalem.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x4_nehalem.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x4_nehalem.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x4_nehalem.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x4_nehalem.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x4_nehalem.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x8_nehalem.S +ZGEMM3MKERNEL = zgemm3m_kernel_2x8_nehalem.S diff --git a/kernel/x86_64/dgemm_ncopy_2.S b/kernel/x86_64/dgemm_ncopy_2.S index 2724cfe92..e4bde49bd 100644 --- a/kernel/x86_64/dgemm_ncopy_2.S +++ b/kernel/x86_64/dgemm_ncopy_2.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_ncopy_4.S b/kernel/x86_64/dgemm_ncopy_4.S index 52115bd4d..1e4431664 100644 --- a/kernel/x86_64/dgemm_ncopy_4.S +++ b/kernel/x86_64/dgemm_ncopy_4.S @@ -45,7 +45,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/kernel/x86_64/dgemm_ncopy_8.S b/kernel/x86_64/dgemm_ncopy_8.S index 5d3627230..f35c3c5af 100644 --- a/kernel/x86_64/dgemm_ncopy_8.S +++ b/kernel/x86_64/dgemm_ncopy_8.S @@ -45,6 +45,12 @@ #define PREFETCHW prefetcht0 #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef MOVAPS #define MOVAPS movaps #endif diff --git a/kernel/x86_64/dgemm_tcopy_2.S b/kernel/x86_64/dgemm_tcopy_2.S index 06e59991d..b0b3590aa 100644 --- a/kernel/x86_64/dgemm_tcopy_2.S +++ b/kernel/x86_64/dgemm_tcopy_2.S @@ -52,6 +52,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_4.S b/kernel/x86_64/dgemm_tcopy_4.S index 
8b81c41c0..85b0253d7 100644 --- a/kernel/x86_64/dgemm_tcopy_4.S +++ b/kernel/x86_64/dgemm_tcopy_4.S @@ -51,6 +51,12 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 12 +#define PREFETCH prefetcht0 +#define MOVUPS_A movups +#endif + #if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) #define PREFETCHSIZE 16 #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/dgemm_tcopy_8.S b/kernel/x86_64/dgemm_tcopy_8.S index 976033714..3d411cda5 100644 --- a/kernel/x86_64/dgemm_tcopy_8.S +++ b/kernel/x86_64/dgemm_tcopy_8.S @@ -46,6 +46,13 @@ #define MOVUPS_A movups #endif +#ifdef SANDYBRIDGE +#define PREFETCHSIZE 16 +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define MOVUPS_A movups +#endif + #ifdef MOVUPS_A #define MOVUPS_A1(OFF, ADDR, REGS) MOVUPS_A OFF(ADDR), REGS #define MOVUPS_A2(OFF, ADDR, BASE, SCALE, REGS) MOVUPS_A OFF(ADDR, BASE, SCALE), REGS diff --git a/kernel/x86_64/gemm_ncopy_2.S b/kernel/x86_64/gemm_ncopy_2.S index 72c2b9d20..06a0feae9 100644 --- a/kernel/x86_64/gemm_ncopy_2.S +++ b/kernel/x86_64/gemm_ncopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_ncopy_4.S b/kernel/x86_64/gemm_ncopy_4.S index a04542f6a..cac647fa0 100644 --- a/kernel/x86_64/gemm_ncopy_4.S +++ b/kernel/x86_64/gemm_ncopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/gemm_tcopy_2.S b/kernel/x86_64/gemm_tcopy_2.S index 8bfaca265..190cebb29 100644 --- a/kernel/x86_64/gemm_tcopy_2.S +++ b/kernel/x86_64/gemm_tcopy_2.S @@ -46,6 +46,13 @@ #define PREFETCHW prefetcht0 #endif +#if defined(SANDYBRIDGE) +#define RPREFETCHSIZE 12 +#define WPREFETCHSIZE (RPREFETCHSIZE * 2) +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#endif + #ifndef WINDOWS_ABI #define M ARG1 /* rdi */ diff --git a/kernel/x86_64/gemm_tcopy_4.S b/kernel/x86_64/gemm_tcopy_4.S index 877969ff5..c2308162f 100644 --- a/kernel/x86_64/gemm_tcopy_4.S +++ b/kernel/x86_64/gemm_tcopy_4.S @@ -46,7 +46,7 @@ #define PREFETCHW prefetcht0 #endif -#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) +#if defined(CORE2) || defined(PENRYN) || defined(DUNNINGTON) || defined(NEHALEM) || defined(SANDYBRIDGE) #define RPREFETCHSIZE 12 #define WPREFETCHSIZE (RPREFETCHSIZE * 4) #define PREFETCH prefetcht0 diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 901a5ad31..9db45a642 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index bfe7ebd69..ca03f86b7 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define 
PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 2df76f1cb..01ad2d96e 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 12) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index bbba0b427..60c1ea778 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 12) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zscal_sse.S b/kernel/x86_64/zscal_sse.S index eb2092dc7..393988e73 100644 --- a/kernel/x86_64/zscal_sse.S +++ b/kernel/x86_64/zscal_sse.S @@ -685,7 +685,7 @@ cmpq $2 * SIZE, INCX jne .L120 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) pshufd $0, %xmm0, %xmm14 pshufd $0, %xmm1, %xmm1 diff --git a/kernel/x86_64/zscal_sse2.S b/kernel/x86_64/zscal_sse2.S index 23d2da73d..a553bbd39 100644 --- a/kernel/x86_64/zscal_sse2.S +++ b/kernel/x86_64/zscal_sse2.S @@ -55,7 +55,7 @@ #include "l1param.h" -#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) +#if defined(NEHALEM) || defined(PENRYN) || defined(DUNNINGTON) || defined(BARCELONA) || defined(NANO) || defined(SANDYBRIDGE) #define USE_PSHUFD #else #define USE_PSHUFD_HALF @@ -803,7 +803,7 @@ cmpq $2 * SIZE, INCX jne .L220 -#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) +#if defined(ALIGNED_ACCESS) && !defined(NEHALEM) && !defined(SANDYBRIDGE) movddup %xmm0, %xmm14 pxor %xmm15, %xmm15 diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index 39f0ff46f..fc54dc4a5 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 711907711..eae31b955 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 175912c71..4d6ad3326 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 3e4b17030..2623bfe6d 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -57,7 +57,7 @@ #define PREFETCHSIZE (16 * 24) #endif -#ifdef NEHALEM +#if defined(NEHALEM) || defined(SANDYBRIDGE) #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 #define PREFETCHSIZE (16 * 24) diff --git a/l1param.h b/l1param.h index f1d223ea7..61c61aa94 100644 --- a/l1param.h +++ b/l1param.h @@ -9,6 +9,13 @@ #define ALIGNED_ACCESS #endif +#ifdef 
SANDYBRIDGE +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE (128 * 12) +#define ALIGNED_ACCESS +#endif + #ifdef ATHLON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/l2param.h b/l2param.h index af9d17179..a371b2ded 100644 --- a/l2param.h +++ b/l2param.h @@ -63,6 +63,17 @@ #define PREFETCHSIZE 64 * 3 #endif +#ifdef SANDYBRIDGE +#define MOVUPS_A movups +#define MOVUPS_XL movups +#define MOVUPS_XS movups +#define MOVUPS_YL movups +#define MOVUPS_YS movups +#define PREFETCH prefetcht0 +#define PREFETCHW prefetcht0 +#define PREFETCHSIZE 64 * 3 +#endif + #ifdef OPTERON #define PREFETCH prefetch #define PREFETCHW prefetchw diff --git a/param.h b/param.h index 72d721d4e..53159a4fd 100644 --- a/param.h +++ b/param.h @@ -913,6 +913,80 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#ifdef SANDYBRIDGE + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 32 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SYMV_P 8 + +#define SWITCH_RATIO 4 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 + +#define SGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 8 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#define XGEMM_DEFAULT_UNROLL_N 1 +#endif + +#define SGEMM_DEFAULT_P 504 +#define SGEMM_DEFAULT_R sgemm_r + +#define DGEMM_DEFAULT_P 504 +#define DGEMM_DEFAULT_R dgemm_r + +#define QGEMM_DEFAULT_P 504 +#define QGEMM_DEFAULT_R qgemm_r + +#define CGEMM_DEFAULT_P 252 +#define CGEMM_DEFAULT_R cgemm_r + +#define ZGEMM_DEFAULT_P 252 +#define ZGEMM_DEFAULT_R zgemm_r + +#define XGEMM_DEFAULT_P 252 +#define XGEMM_DEFAULT_R xgemm_r + +#define SGEMM_DEFAULT_Q 512 +#define DGEMM_DEFAULT_Q 256 +#define QGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 512 +#define ZGEMM_DEFAULT_Q 256 +#define XGEMM_DEFAULT_Q 128 + +#define GETRF_FACTOR 0.72 + +#endif + + + #ifdef ATOM #define SNUMOPT 2 From 8218cbea2a2b706775c3c302ea1a4c361bd40bcc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= Date: Wed, 2 May 2012 11:33:06 +0200 Subject: [PATCH 02/46] Add Xianyi's patch for segfaults on kernel 2.6.32 and add documentation accordingly. --- GotoBLAS_03FAQ.txt | 8 ++++++++ segfaults.patch | 27 +++++++++++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 segfaults.patch diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index b6033fe53..b45e6d095 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -90,6 +90,14 @@ number of threads will consume extra resource. I recommend you to specify minimum number of threads. +1.9 Q I have segfaults when I compile with USE_OPENMP=1. What's wrong? + + A This may be related to a bug in the Linux kernel 2.6.32. Try applying + the patch segaults.patch using + + git am segfaults.patch + + and see if the crashes persist. 2. 
Architecture Specific issue or Implementation diff --git a/segfaults.patch b/segfaults.patch new file mode 100644 index 000000000..9585fa04b --- /dev/null +++ b/segfaults.patch @@ -0,0 +1,27 @@ +From ac40907baa90a0acc78139762ffa3c6f09274236 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= +Date: Wed, 2 May 2012 11:22:52 +0200 +Subject: [PATCH] Fix segfaults with kernel 2.6.32. This comes at the price of many compiler warnings. + +--- + common_linux.h | 4 ++-- + 1 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/common_linux.h b/common_linux.h +index b0381d9..40a94cb 100644 +--- a/common_linux.h ++++ b/common_linux.h +@@ -76,8 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, + #endif + #else + //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 +-// unsigned long null_nodemask=0; +- return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); ++ unsigned long null_nodemask=0; ++ return syscall(SYS_mbind, addr, len, mode, &nodemask, maxnode, flags); + #endif + } + +-- +1.7.1 + From 4236d0d93836cd304f27646f18a28d309210e14d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= Date: Wed, 2 May 2012 12:03:07 +0200 Subject: [PATCH 03/46] Add note on compiler warnings for the segfaults patch. --- GotoBLAS_03FAQ.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index b45e6d095..0213d8d58 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -97,7 +97,8 @@ git am segfaults.patch - and see if the crashes persist. + and see if the crashes persist. Note that this patch will lead to many + compiler warnings. 2. Architecture Specific issue or Implementation From 7f89edee3efce16b3a6db1a4382b432770acee21 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 3 May 2012 20:05:34 +0800 Subject: [PATCH 04/46] refs #103 Increase GEMM_MULTITHREAD_THRESHOLD to 50. --- Makefile.rule | 4 ++-- Makefile.system | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Makefile.rule b/Makefile.rule index b6cf98a3e..56cd63540 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -94,8 +94,8 @@ VERSION = 0.1.1 # If any gemm arguement m, n or k is less or equal this threshold, gemm will be execute # with single thread. You can use this flag to avoid the overhead of multi-threading -# in small matrix sizes. The default value is 4. -# GEMM_MULTITHREAD_THRESHOLD = 4 +# in small matrix sizes. The default value is 50. +# GEMM_MULTITHREAD_THRESHOLD = 50 # If you need santy check by comparing reference BLAS. It'll be very # slow (Not implemented yet). diff --git a/Makefile.system b/Makefile.system index e2fe9f730..ef2e8fcdd 100644 --- a/Makefile.system +++ b/Makefile.system @@ -45,7 +45,7 @@ GETARCH_FLAGS += -DUSE64BITINT endif ifndef GEMM_MULTITHREAD_THRESHOLD -GEMM_MULTITHREAD_THRESHOLD=4 +GEMM_MULTITHREAD_THRESHOLD=50 endif GETARCH_FLAGS += -DGEMM_MULTITHREAD_THRESHOLD=$(GEMM_MULTITHREAD_THRESHOLD) From e7846547be05ad548e748e09403eeee5ca5e7a24 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 7 May 2012 16:38:44 +0800 Subject: [PATCH 05/46] Refs #85 #104. Disable my_bind to fix this segfault issue. 
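For context on the hunk below: it changes segfaults.patch so that the my_mbind wrapper simply returns 0 instead of issuing the mbind syscall. A minimal sketch of the shape of that wrapper — not the actual common_linux.h, whose surrounding #ifdef logic is more involved, and with a hypothetical SKIP_MBIND switch added purely for illustration:

    #define _GNU_SOURCE
    #include <unistd.h>
    #include <sys/syscall.h>

    /* Sketch only: mbind() is a NUMA placement hint. Skipping it leaves the
     * allocation valid under the default memory policy, which is why simply
     * returning 0 is a safe workaround on kernels where the syscall crashes. */
    static inline int my_mbind(void *addr, unsigned long len, int mode,
                               unsigned long *nodemask, unsigned long maxnode,
                               unsigned flags) {
    #ifdef SKIP_MBIND   /* hypothetical switch, for illustration only */
      return 0;         /* what this patch makes the wrapper do */
    #else
      return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
    #endif
    }

    int main(void) {
      char buf[4096];
      /* Mode 0 == MPOL_DEFAULT. An unaligned address yields EINVAL, which is
       * fine here; the point is that the call returns instead of crashing. */
      return my_mbind(buf, sizeof buf, 0, (unsigned long *)0, 0, 0) ? 1 : 0;
    }
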
--- segfaults.patch | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/segfaults.patch b/segfaults.patch index 9585fa04b..f68d0438d 100644 --- a/segfaults.patch +++ b/segfaults.patch @@ -11,14 +11,14 @@ diff --git a/common_linux.h b/common_linux.h index b0381d9..40a94cb 100644 --- a/common_linux.h +++ b/common_linux.h -@@ -76,8 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, +@@ -76,9 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, #endif #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 -// unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); + unsigned long null_nodemask=0; -+ return syscall(SYS_mbind, addr, len, mode, &nodemask, maxnode, flags); ++ return 0; #endif } From dee74174ff1df9de22979fa4a76aef5272aeeb70 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 8 May 2012 23:50:46 +0800 Subject: [PATCH 06/46] Refs #85 #104. Use patch instead of git to apply this segfaults.patch. --- GotoBLAS_03FAQ.txt | 2 +- segfaults.patch | 25 +++++-------------------- 2 files changed, 6 insertions(+), 21 deletions(-) diff --git a/GotoBLAS_03FAQ.txt b/GotoBLAS_03FAQ.txt index 0213d8d58..be623d608 100644 --- a/GotoBLAS_03FAQ.txt +++ b/GotoBLAS_03FAQ.txt @@ -95,7 +95,7 @@ A This may be related to a bug in the Linux kernel 2.6.32. Try applying the patch segaults.patch using - git am segfaults.patch + patch < segfaults.patch and see if the crashes persist. Note that this patch will lead to many compiler warnings. diff --git a/segfaults.patch b/segfaults.patch index f68d0438d..375ab766c 100644 --- a/segfaults.patch +++ b/segfaults.patch @@ -1,27 +1,12 @@ -From ac40907baa90a0acc78139762ffa3c6f09274236 Mon Sep 17 00:00:00 2001 -From: =?UTF-8?q?Alexander=20Ebersp=C3=A4cher?= -Date: Wed, 2 May 2012 11:22:52 +0200 -Subject: [PATCH] Fix segfaults with kernel 2.6.32. This comes at the price of many compiler warnings. - ---- - common_linux.h | 4 ++-- - 1 files changed, 2 insertions(+), 2 deletions(-) - -diff --git a/common_linux.h b/common_linux.h -index b0381d9..40a94cb 100644 ---- a/common_linux.h -+++ b/common_linux.h -@@ -76,9 +76,8 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, - #endif +diff -ruN common_linux.h.orig common_linux.h +--- common_linux.h.orig 2012-04-23 11:27:55.000000000 +0800 ++++ common_linux.h 2012-05-08 23:43:00.000000000 +0800 +@@ -77,7 +77,7 @@ #else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 --// unsigned long null_nodemask=0; + // unsigned long null_nodemask=0; - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -+ unsigned long null_nodemask=0; + return 0; #endif } --- -1.7.1 - From 52485e5fd08f5420c13c54bbf3342aef277ea0e3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Thu, 10 May 2012 13:01:35 +0800 Subject: [PATCH 07/46] Refs #105. Export missing LAPACK functions in shared library. They are as following, slabad, dlabad, slacpy, dlacpy, slamch, dlamch, slartg, slartgp, slartgs, dlartg, dlartgp, dlartgs, slascl, dlascl, slaset, dlaset. 
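One way to confirm that symbols like these actually make it into the shared library is to resolve them at runtime. A small self-contained sketch — the library path and the gfortran-style trailing underscore are assumptions about the local build:

    #include <dlfcn.h>
    #include <stdio.h>

    int main(void) {
      /* Path is illustrative; point it at wherever libopenblas.so was built. */
      void *h = dlopen("./libopenblas.so", RTLD_NOW);
      if (h == NULL) { fprintf(stderr, "dlopen: %s\n", dlerror()); return 1; }
      /* Fortran compilers such as gfortran append an underscore to symbols. */
      const char *syms[] = { "dlamch_", "dlascl_", "dlaset_" };
      for (int i = 0; i < 3; i++)
        printf("%-10s %s\n", syms[i], dlsym(h, syms[i]) ? "exported" : "missing");
      dlclose(h);
      return 0;
    }

Compile with -ldl. The "missing" case is what callers of these routines hit before this patch adds them to the gensymbol export list below.
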
--- exports/gensymbol | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 1f30d7b15..029dc8395 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -100,9 +100,10 @@ sggglm, sgghrd, sgglse, sggqrf, sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, - shsein, shseqr, slabrd, slacon, slacn2, + shsein, shseqr, slabad, slabrd, slacon, slacn2, slacpy, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, + slamch, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, slapll, slapmt, slapy2, slapy3, @@ -110,8 +111,9 @@ slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, - slarrv, slartv, - slarz, slarzb, slarzt, slasy2, slasyf, + slarnv, slarrv, slartg, slartgp, slartgs, slartv, + slarz, slarzb, slarzt, slascl, slasy2, slasyf, + slaset, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, @@ -219,9 +221,10 @@ dggglm, dgghrd, dgglse, dggqrf, dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, - dhsein, dhseqr, dlabrd, dlacon, dlacn2, + dhsein, dhseqr, dlabad, dlabrd, dlacon, dlacn2, dlacpy, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, + dlamch, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, dlapll, dlapmt, dlapy2, dlapy3, @@ -229,8 +232,9 @@ dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, - dlarrv, dlartv, - dlarz, dlarzb, dlarzt, dlasy2, dlasyf, + dlarnv, dlarrv, dlartg, dlartgp, dlartgs, dlartv, + dlarz, dlarzb, dlarzt, dlascl, dlasy2, dlasyf, + dlaset, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, From 06e208c5c39911009f388275afe4097013d1abd2 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sun, 13 May 2012 11:43:29 +0800 Subject: [PATCH 08/46] Refs #106. Fixed wget and md5 bug on FreeBSD and NetBSD. --- Makefile | 3 ++- Makefile.system | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 8d78a844b..905d686a2 100644 --- a/Makefile +++ b/Makefile @@ -256,7 +256,8 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz lapack-3.4.1.tgz : ifndef NOFORTRAN -ifeq ($(OSNAME), Darwin) +#http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or +ifeq ($(OSNAME), $(filter $(OSNAME),Darwin FreeBSD NetBSD)) curl -O $(LAPACK_URL) else wget $(LAPACK_URL) diff --git a/Makefile.system b/Makefile.system index ef2e8fcdd..c9e74faa6 100644 --- a/Makefile.system +++ b/Makefile.system @@ -108,6 +108,14 @@ export MACOSX_DEPLOYMENT_TARGET=10.2 MD5SUM = md5 -r endif +ifeq ($(OSNAME), FreeBSD) +MD5SUM = md5 -r +endif + +ifeq ($(OSNAME), NetBSD) +MD5SUM = md5 -r +endif + ifeq ($(OSNAME), Linux) EXTRALIB += -lm endif From fc4927fa0f8821e65455113374b4aa2020501fb0 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Tue, 15 May 2012 23:58:22 +0200 Subject: [PATCH 09/46] Fixed #107. 
Export missing LAPACK auxiliary routines (ALLAUX, SCLAUX, DZLAUX) Added some documentation on how the symbol list is derived and synchronized with lapack-3.4.1 to minimize the differences. --- exports/gensymbol | 203 ++++++++++++++++++++++++++++++++-------------- 1 file changed, 144 insertions(+), 59 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 029dc8395..735b73f4e 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -73,6 +73,7 @@ ); @lapackobjs = ( + # These routines are provided by OpenBLAS. sgesv, dgesv, cgesv, zgesv, sgetf2, dgetf2, cgetf2, zgetf2, sgetrf, dgetrf, cgetrf, zgetrf, @@ -88,32 +89,85 @@ ); @lapackobjs2 = ( - sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, + # These routines are provided by LAPACK (reference implementation). + # + # This list is prepared by copying all routines listed in + # `lapack-3.4.1/SRC/Makefile` and replacing the '.o' suffix with a comma. + # Thereafter the following routines should be removed: + # - those provided by OpenBLAS (see @lapackobjs) + # - extra precision routines (see @lapack_extendedprecision_objs) + # Each of these have been marked individually with "already provided" or "excluded". + + # ALLAUX -- Auxiliary routines called from all precisions + # already provided by @blasobjs: xerbla, lsame + ilaenv, ieeeck, lsamen, xerbla_array, iparmq, + ilaprec, ilatrans, ilauplo, iladiag, chla_transtype, + ilaver, slamch, + + # SCLAUX -- Auxiliary routines called from both REAL and COMPLEX. + # excluded: second_$(TIMER) + sbdsdc, + sbdsqr, sdisna, slabad, slacpy, sladiv, slae2, slaebz, + slaed0, slaed1, slaed2, slaed3, slaed4, slaed5, slaed6, + slaed7, slaed8, slaed9, slaeda, slaev2, slagtf, + slagts, slamrg, slanst, + slapy2, slapy3, slarnv, + slarra, slarrb, slarrc, slarrd, slarre, slarrf, slarrj, + slarrk, slarrr, slaneg, + slartg, slaruv, slas2, slascl, + slasd0, slasd1, slasd2, slasd3, slasd4, slasd5, slasd6, + slasd7, slasd8, slasda, slasdq, slasdt, + slaset, slasq1, slasq2, slasq3, slasq4, slasq5, slasq6, + slasr, slasrt, slassq, slasv2, spttrf, sstebz, sstedc, + ssteqr, ssterf, slaisnan, sisnan, + slartgp, slartgs, + + # DZLAUX -- Auxiliary routines called from both DOUBLE and COMPLEX*16. 
+ # excluded: dsecnd_$(TIMER) + dbdsdc, + dbdsqr, ddisna, dlabad, dlacpy, dladiv, dlae2, dlaebz, + dlaed0, dlaed1, dlaed2, dlaed3, dlaed4, dlaed5, dlaed6, + dlaed7, dlaed8, dlaed9, dlaeda, dlaev2, dlagtf, + dlagts, dlamrg, dlanst, + dlapy2, dlapy3, dlarnv, + dlarra, dlarrb, dlarrc, dlarrd, dlarre, dlarrf, dlarrj, + dlarrk, dlarrr, dlaneg, + dlartg, dlaruv, dlas2, dlascl, + dlasd0, dlasd1, dlasd2, dlasd3, dlasd4, dlasd5, dlasd6, + dlasd7, dlasd8, dlasda, dlasdq, dlasdt, + dlaset, dlasq1, dlasq2, dlasq3, dlasq4, dlasq5, dlasq6, + dlasr, dlasrt, dlassq, dlasv2, dpttrf, dstebz, dstedc, + dsteqr, dsterf, dlaisnan, disnan, + dlartgp, dlartgs, + dlamch, + + # SLASRC -- Single precision real LAPACK routines + # already provided by @lapackobjs: + # sgesv, sgetf2, slaswp, slauu2, slauum, spotf2, spotri, strti2, strtri + sgbbrd, sgbcon, sgbequ, sgbrfs, sgbsv, sgbsvx, sgbtf2, sgbtrf, sgbtrs, sgebak, sgebal, sgebd2, sgebrd, sgecon, sgeequ, sgees, sgeesx, sgeev, sgeevx, sgegs, sgegv, sgehd2, sgehrd, sgelq2, sgelqf, sgels, sgelsd, sgelss, sgelsx, sgelsy, sgeql2, sgeqlf, - sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, sgerq2, sgerqf, - sgesc2, sgesdd, sgesvd, sgesvx, sgetc2, - sgetri, + sgeqp3, sgeqpf, sgeqr2, sgeqr2p, sgeqrf, sgeqrfp, sgerfs, + sgerq2, sgerqf, sgesc2, sgesdd, sgesvd, sgesvx, + sgetc2, sgetri, sggbak, sggbal, sgges, sggesx, sggev, sggevx, sggglm, sgghrd, sgglse, sggqrf, - sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, + sggrqf, sggsvd, sggsvp, sgtcon, sgtrfs, sgtsv, sgtsvx, sgttrf, sgttrs, sgtts2, shgeqz, - shsein, shseqr, slabad, slabrd, slacon, slacn2, slacpy, + shsein, shseqr, slabrd, slacon, slacn2, slaein, slaexc, slag2, slags2, slagtm, slagv2, slahqr, slahrd, slahr2, slaic1, slaln2, slals0, slalsa, slalsd, - slamch, slangb, slange, slangt, slanhs, slansb, slansp, slansy, slantb, slantp, slantr, slanv2, - slapll, slapmt, slapy2, slapy3, + slapll, slapmt, slaqgb, slaqge, slaqp2, slaqps, slaqsb, slaqsp, slaqsy, slaqr0, slaqr1, slaqr2, slaqr3, slaqr4, slaqr5, slaqtr, slar1v, slar2v, ilaslr, ilaslc, slarf, slarfb, slarfg, slarfgp, slarft, slarfx, slargv, - slarnv, slarrv, slartg, slartgp, slartgs, slartv, - slarz, slarzb, slarzt, slascl, slasy2, slasyf, - slaset, + slarrv, slartv, + slarz, slarzb, slarzt, slasy2, slasyf, slatbs, slatdf, slatps, slatrd, slatrs, slatrz, slatzm, sopgtr, sopmtr, sorg2l, sorg2r, sorgbr, sorghr, sorgl2, sorglq, sorgql, sorgqr, sorgr2, @@ -121,19 +175,21 @@ sormbr, sormhr, sorml2, sormlq, sormql, sormqr, sormr2, sormr3, sormrq, sormrz, sormtr, spbcon, spbequ, spbrfs, spbstf, spbsv, spbsvx, - spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, - sposvx, spotrs, spstrf, spstf2, + spbtf2, spbtrf, spbtrs, spocon, spoequ, sporfs, sposv, + sposvx, spstrf, spstf2, sppcon, sppequ, spprfs, sppsv, sppsvx, spptrf, spptri, spptrs, sptcon, - spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, + spteqr, sptrfs, sptsv, sptsvx, spttrs, sptts2, srscl, ssbev, ssbevd, ssbevx, ssbgst, ssbgv, ssbgvd, ssbgvx, ssbtrd, sspcon, sspev, sspevd, sspevx, sspgst, sspgv, sspgvd, sspgvx, ssprfs, sspsv, sspsvx, ssptrd, ssptrf, ssptri, ssptrs, sstegr, sstein, sstev, sstevd, sstevr, - sstevx, ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, + sstevx, + ssycon, ssyev, ssyevd, ssyevr, ssyevx, ssygs2, ssygst, ssygv, ssygvd, ssygvx, ssyrfs, ssysv, ssysvx, ssytd2, ssytf2, ssytrd, ssytrf, ssytri, ssytri2, ssytri2x, - ssyswapr, ssytrs, ssytrs2, ssyconv, stbcon, + ssyswapr, ssytrs, ssytrs2, ssyconv, + stbcon, stbrfs, stbtrs, stgevc, stgex2, stgexc, stgsen, stgsja, stgsna, stgsy2, stgsyl, 
stpcon, stprfs, stptri, stptrs, @@ -146,26 +202,38 @@ sbbcsd, slapmr, sorbdb, sorcsd, sgeqrt, sgeqrt2, sgeqrt3, sgemqrt, stpqrt, stpqrt2, stpmqrt, stprfb, - + + # DSLASRC -- Double-single mixed precision real routines called from + # single, single-extra and double precision real LAPACK + # routines (i.e. from SLASRC, SXLASRC, DLASRC). + # + # already provided by @lapackobjs: + # sgetrs, spotrf, sgetrf + spotrs, + + # CLASRC -- Single precision complex LAPACK routines + # already provided by @blasobjs: csymv + # already provided by @lapackobjs: + # cgesv, cgetf2, claswp, clauu2, clauum, cpotf2, cpotri, ctrti2, ctrtri cbdsqr, cgbbrd, cgbcon, cgbequ, cgbrfs, cgbsv, cgbsvx, cgbtf2, cgbtrf, cgbtrs, cgebak, cgebal, cgebd2, cgebrd, cgecon, cgeequ, cgees, cgeesx, cgeev, cgeevx, cgegs, cgegv, cgehd2, cgehrd, cgelq2, cgelqf, cgels, cgelsd, cgelss, cgelsx, cgelsy, cgeql2, cgeqlf, cgeqp3, - cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, cgerq2, cgerqf, - cgesc2, cgesdd, cgesvd, cgesvx, cgetc2, - cgetri, + cgeqpf, cgeqr2, cgeqr2p, cgeqrf, cgeqrfp, cgerfs, + cgerq2, cgerqf, cgesc2, cgesdd, cgesvd, + cgesvx, cgetc2, cgetri, cggbak, cggbal, cgges, cggesx, cggev, cggevx, cggglm, cgghrd, cgglse, cggqrf, cggrqf, cggsvd, cggsvp, - cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, + cgtcon, cgtrfs, cgtsv, cgtsvx, cgttrf, cgttrs, cgtts2, chbev, chbevd, chbevx, chbgst, chbgv, chbgvd, chbgvx, chbtrd, checon, cheev, cheevd, cheevr, cheevx, chegs2, chegst, chegv, chegvd, chegvx, cherfs, chesv, chesvx, chetd2, chetf2, chetrd, - chetrf, chetri, chetri2, chetri2x, cheswapr, + chetrf, chetri, chetri2, chetri2x, cheswapr, chetrs, chetrs2, chgeqz, chpcon, chpev, chpevd, - chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, + chpevx, chpgst, chpgv, chpgvd, chpgvx, chprfs, chpsv, chpsvx, chptrd, chptrf, chptri, chptrs, chsein, chseqr, clabrd, clacgv, clacon, clacn2, clacp2, clacpy, clacrm, clacrt, cladiv, @@ -179,21 +247,22 @@ claqhb, claqhe, claqhp, claqp2, claqps, claqsb, claqr0, claqr1, claqr2, claqr3, claqr4, claqr5, claqsp, claqsy, clar1v, clar2v, ilaclr, ilaclc, - clarf, clarfb, clarfg, clarfgp, clarft, + clarf, clarfb, clarfg, clarft, clarfgp, clarfx, clargv, clarnv, clarrv, clartg, clartv, clarz, clarzb, clarzt, clascl, claset, clasr, classq, clasyf, clatbs, clatdf, clatps, clatrd, clatrs, clatrz, - clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, + clatzm, cpbcon, cpbequ, cpbrfs, cpbstf, cpbsv, cpbsvx, cpbtf2, cpbtrf, cpbtrs, cpocon, cpoequ, cporfs, - cposv, cposvx, cpotrs, cpstrf, cpstf2, + cposv, cposvx, cpstrf, cpstf2, cppcon, cppequ, cpprfs, cppsv, cppsvx, cpptrf, cpptri, cpptrs, cptcon, cpteqr, cptrfs, cptsv, cptsvx, cpttrf, cpttrs, cptts2, - crot, cspcon, cspmv, cspr, csprfs, cspsv, + crot, cspcon, cspmv, cspr, csprfs, cspsv, cspsvx, csptrf, csptri, csptrs, csrscl, cstedc, - cstegr, cstein, csteqr, csycon, - csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, - csytri2, csytri2x, csyswapr, - csytrs, csytrs2, csyconv, ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, + cstegr, cstein, csteqr, + csycon, + csyr, csyrfs, csysv, csysvx, csytf2, csytrf, csytri, csytri2, csytri2x, + csyswapr, csytrs, csytrs2, csyconv, + ctbcon, ctbrfs, ctbtrs, ctgevc, ctgex2, ctgexc, ctgsen, ctgsja, ctgsna, ctgsy2, ctgsyl, ctpcon, ctprfs, ctptri, ctptrs, ctrcon, ctrevc, ctrexc, ctrrfs, ctrsen, ctrsna, @@ -209,32 +278,42 @@ cgeqrt, cgeqrt2, cgeqrt3, cgemqrt, ctpqrt, ctpqrt2, ctpmqrt, ctprfb, + # ZCLASRC -- Double-single mixed precision complex routines called from + # single, single-extra and double precision complex LAPACK + 
# routines (i.e. from CLASRC, CXLASRC, ZLASRC). + # + # already provided by @lapackobjs: + # cgetrs, cpotrf, cgetrf + cpotrs, + + # DLASRC -- Double precision real LAPACK routines + # already provided by @lapackobjs: + # dgesv, dgetf2, dgetrs, dlaswp, dlauu2, dlauum, dpotf2, dpotrf, dpotri, + # dtrti2, dtrtri dgbbrd, dgbcon, dgbequ, dgbrfs, dgbsv, dgbsvx, dgbtf2, dgbtrf, dgbtrs, dgebak, dgebal, dgebd2, dgebrd, dgecon, dgeequ, dgees, dgeesx, dgeev, dgeevx, dgegs, dgegv, dgehd2, dgehrd, dgelq2, dgelqf, dgels, dgelsd, dgelss, dgelsx, dgelsy, dgeql2, dgeqlf, - dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, dgerq2, dgerqf, - dgesc2, dgesdd, dgesvd, dgesvx, dgetc2, - dgetri, + dgeqp3, dgeqpf, dgeqr2, dgeqr2p, dgeqrf, dgeqrfp, dgerfs, + dgerq2, dgerqf, dgesc2, dgesdd, dgesvd, dgesvx, + dgetc2, dgetri, dggbak, dggbal, dgges, dggesx, dggev, dggevx, dggglm, dgghrd, dgglse, dggqrf, - dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, + dggrqf, dggsvd, dggsvp, dgtcon, dgtrfs, dgtsv, dgtsvx, dgttrf, dgttrs, dgtts2, dhgeqz, - dhsein, dhseqr, dlabad, dlabrd, dlacon, dlacn2, dlacpy, + dhsein, dhseqr, dlabrd, dlacon, dlacn2, dlaein, dlaexc, dlag2, dlags2, dlagtm, dlagv2, dlahqr, dlahrd, dlahr2, dlaic1, dlaln2, dlals0, dlalsa, dlalsd, - dlamch, dlangb, dlange, dlangt, dlanhs, dlansb, dlansp, dlansy, dlantb, dlantp, dlantr, dlanv2, - dlapll, dlapmt, dlapy2, dlapy3, + dlapll, dlapmt, dlaqgb, dlaqge, dlaqp2, dlaqps, dlaqsb, dlaqsp, dlaqsy, dlaqr0, dlaqr1, dlaqr2, dlaqr3, dlaqr4, dlaqr5, dlaqtr, dlar1v, dlar2v, iladlr, iladlc, - dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, dlargv, - dlarnv, dlarrv, dlartg, dlartgp, dlartgs, dlartv, - dlarz, dlarzb, dlarzt, dlascl, dlasy2, dlasyf, - dlaset, + dlarf, dlarfb, dlarfg, dlarfgp, dlarft, dlarfx, + dlargv, dlarrv, dlartv, + dlarz, dlarzb, dlarzt, dlasy2, dlasyf, dlatbs, dlatdf, dlatps, dlatrd, dlatrs, dlatrz, dlatzm, dopgtr, dopmtr, dorg2l, dorg2r, dorgbr, dorghr, dorgl2, dorglq, dorgql, dorgqr, dorgr2, @@ -242,21 +321,22 @@ dormbr, dormhr, dorml2, dormlq, dormql, dormqr, dormr2, dormr3, dormrq, dormrz, dormtr, dpbcon, dpbequ, dpbrfs, dpbstf, dpbsv, dpbsvx, - dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, + dpbtf2, dpbtrf, dpbtrs, dpocon, dpoequ, dporfs, dposv, dposvx, dpotrs, dpstrf, dpstf2, dppcon, dppequ, dpprfs, dppsv, dppsvx, dpptrf, dpptri, dpptrs, dptcon, - dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, + dpteqr, dptrfs, dptsv, dptsvx, dpttrs, dptts2, drscl, dsbev, dsbevd, dsbevx, dsbgst, dsbgv, dsbgvd, dsbgvx, dsbtrd, dspcon, dspev, dspevd, dspevx, dspgst, dspgv, dspgvd, dspgvx, dsprfs, dspsv, dspsvx, dsptrd, dsptrf, dsptri, dsptrs, dstegr, dstein, dstev, dstevd, dstevr, - dstevx, dsycon, dsyev, dsyevd, dsyevr, + dstevx, + dsycon, dsyev, dsyevd, dsyevr, dsyevx, dsygs2, dsygst, dsygv, dsygvd, dsygvx, dsyrfs, dsysv, dsysvx, - dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytrs, dsytrs2, - dsytri2, dsytri2x, dsyswapr, dsyconv, dtbcon, - dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, + dsytd2, dsytf2, dsytrd, dsytrf, dsytri, dsytri2, dsytri2x, + dsyswapr, dsytrs, dsytrs2, dsyconv, + dtbcon, dtbrfs, dtbtrs, dtgevc, dtgex2, dtgexc, dtgsen, dtgsja, dtgsna, dtgsy2, dtgsyl, dtpcon, dtprfs, dtptri, dtptrs, dtrcon, dtrevc, dtrexc, dtrrfs, dtrsen, dtrsna, dtrsyl, @@ -270,6 +350,11 @@ dgeqrt, dgeqrt2, dgeqrt3, dgemqrt, dtpqrt, dtpqrt2, dtpmqrt, dtprfb, + # ZLASRC -- Double precision complex LAPACK routines + # already provided by @blasobjs: zsymv + # already provided by @lapackobjs: + # zgesv, zgetrs, zgetf2, zlaswp, zlauu2, zlauum, zpotf2, zpotrf, 
zpotri, + # ztrti2, ztrtri zbdsqr, zgbbrd, zgbcon, zgbequ, zgbrfs, zgbsv, zgbsvx, zgbtf2, zgbtrf, zgbtrs, zgebak, zgebal, zgebd2, zgebrd, zgecon, zgeequ, zgees, zgeesx, zgeev, zgeevx, @@ -281,14 +366,14 @@ zggbak, zggbal, zgges, zggesx, zggev, zggevx, zggglm, zgghrd, zgglse, zggqrf, zggrqf, zggsvd, zggsvp, - zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, + zgtcon, zgtrfs, zgtsv, zgtsvx, zgttrf, zgttrs, zgtts2, zhbev, zhbevd, zhbevx, zhbgst, zhbgv, zhbgvd, zhbgvx, zhbtrd, zhecon, zheev, zheevd, zheevr, zheevx, zhegs2, zhegst, zhegv, zhegvd, zhegvx, zherfs, zhesv, zhesvx, zhetd2, zhetf2, zhetrd, zhetrf, zhetri, zhetri2, zhetri2x, zheswapr, zhetrs, zhetrs2, zhgeqz, zhpcon, zhpev, zhpevd, - zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, + zhpevx, zhpgst, zhpgv, zhpgvd, zhpgvx, zhprfs, zhpsv, zhpsvx, zhptrd, zhptrf, zhptri, zhptrs, zhsein, zhseqr, zlabrd, zlacgv, zlacon, zlacn2, zlacp2, zlacpy, zlacrm, zlacrt, zladiv, @@ -304,22 +389,23 @@ zlaqr0, zlaqr1, zlaqr2, zlaqr3, zlaqr4, zlaqr5, zlaqsp, zlaqsy, zlar1v, zlar2v, ilazlr, ilazlc, zlarcm, zlarf, zlarfb, - zlarfg, zlarfgp, zlarft, + zlarfg, zlarft, zlarfgp, zlarfx, zlargv, zlarnv, zlarrv, zlartg, zlartv, - zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, + zlarz, zlarzb, zlarzt, zlascl, zlaset, zlasr, zlassq, zlasyf, zlatbs, zlatdf, zlatps, zlatrd, zlatrs, zlatrz, zlatzm, - zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, + zpbcon, zpbequ, zpbrfs, zpbstf, zpbsv, zpbsvx, zpbtf2, zpbtrf, zpbtrs, zpocon, zpoequ, zporfs, zposv, zposvx, zpotrs, zpstrf, zpstf2, zppcon, zppequ, zpprfs, zppsv, zppsvx, zpptrf, zpptri, zpptrs, zptcon, zpteqr, zptrfs, zptsv, zptsvx, zpttrf, zpttrs, zptts2, - zrot, zspcon, zspmv, zspr, zsprfs, zspsv, + zrot, zspcon, zspmv, zspr, zsprfs, zspsv, zspsvx, zsptrf, zsptri, zsptrs, zdrscl, zstedc, - zstegr, zstein, zsteqr, zsycon, - zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, - zsytri2, zsytri2x, zsyswapr, - zsytrs, zsytrs2, zsyconv, ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, + zstegr, zstein, zsteqr, + zsycon, + zsyr, zsyrfs, zsysv, zsysvx, zsytf2, zsytrf, zsytri, zsytri2, zsytri2x, + zsyswapr, zsytrs, zsytrs2, zsyconv, + ztbcon, ztbrfs, ztbtrs, ztgevc, ztgex2, ztgexc, ztgsen, ztgsja, ztgsna, ztgsy2, ztgsyl, ztpcon, ztprfs, ztptri, ztptrs, ztrcon, ztrevc, ztrexc, ztrrfs, ztrsen, ztrsna, @@ -336,7 +422,6 @@ zbbcsd, zlapmr, zunbdb, zuncsd, zgeqrt, zgeqrt2, zgeqrt3, zgemqrt, ztpqrt, ztpqrt2, ztpmqrt, ztprfb, - ); @lapack_extendedprecision_objs = ( From f404a177878eee3acea0a5934fecddc75caaf5f3 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 16 May 2012 11:24:24 +0200 Subject: [PATCH 10/46] Symbol list: document how LAPACKE exports are derived and synchronize with lapack-3.4.1 This change adds the missing LAPACKE_[zc]syr routines but does not remove any exported functions. --- exports/gensymbol | 422 +++++++++++++++++++++++++++++----------------- 1 file changed, 271 insertions(+), 151 deletions(-) diff --git a/exports/gensymbol b/exports/gensymbol index 735b73f4e..dbd559473 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -430,15 +430,170 @@ ); @lapackeobjs = ( - lapack_make_complex_double, - lapack_make_complex_float, + # LAPACK C interface routines. + # + # This list is prepared in a similar manner to @lapackobjs2, however the + # functions all begin with an uppercase prefix (with the exception of the + # make_complex_* routines). + # + # The functions corresponding to @(MATGEN_OBJ) and @(SRCX_OBJ) are not + # exported since the respective LAPACK routines are not built by default. 
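+ #
+ # (Hypothetical cross-check, not used by the build: the LAPACKE_* names
+ # below can be regenerated from the LAPACKE sources with something like
+ #     ls lapack-3.4.1/lapacke/src/lapacke_*.c \
+ #         | sed 's!.*/lapacke_!LAPACKE_!; s!\.c$!,!'
+ # modulo the utils/ helpers listed first and the extended-precision and
+ # matgen entries that are deliberately excluded further below.)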
+ + # @(OBJ) from `lapack-3.4.1/lapacke/utils/Makefile` + LAPACKE_cgb_nancheck, + LAPACKE_cgb_trans, + LAPACKE_cge_nancheck, + LAPACKE_cge_trans, + LAPACKE_cgg_nancheck, + LAPACKE_cgg_trans, + LAPACKE_cgt_nancheck, + LAPACKE_chb_nancheck, + LAPACKE_chb_trans, + LAPACKE_che_nancheck, + LAPACKE_che_trans, + LAPACKE_chp_nancheck, + LAPACKE_chp_trans, + LAPACKE_chs_nancheck, + LAPACKE_chs_trans, LAPACKE_c_nancheck, + LAPACKE_cpb_nancheck, + LAPACKE_cpb_trans, + LAPACKE_cpf_nancheck, + LAPACKE_cpf_trans, + LAPACKE_cpo_nancheck, + LAPACKE_cpo_trans, + LAPACKE_cpp_nancheck, + LAPACKE_cpp_trans, + LAPACKE_cpt_nancheck, + LAPACKE_csp_nancheck, + LAPACKE_csp_trans, + LAPACKE_cst_nancheck, + LAPACKE_csy_nancheck, + LAPACKE_csy_trans, + LAPACKE_ctb_nancheck, + LAPACKE_ctb_trans, + LAPACKE_ctf_nancheck, + LAPACKE_ctf_trans, + LAPACKE_ctp_nancheck, + LAPACKE_ctp_trans, + LAPACKE_ctr_nancheck, + LAPACKE_ctr_trans, + LAPACKE_dgb_nancheck, + LAPACKE_dgb_trans, + LAPACKE_dge_nancheck, + LAPACKE_dge_trans, + LAPACKE_dgg_nancheck, + LAPACKE_dgg_trans, + LAPACKE_dgt_nancheck, + LAPACKE_dhs_nancheck, + LAPACKE_dhs_trans, + LAPACKE_d_nancheck, + LAPACKE_dpb_nancheck, + LAPACKE_dpb_trans, + LAPACKE_dpf_nancheck, + LAPACKE_dpf_trans, + LAPACKE_dpo_nancheck, + LAPACKE_dpo_trans, + LAPACKE_dpp_nancheck, + LAPACKE_dpp_trans, + LAPACKE_dpt_nancheck, + LAPACKE_dsb_nancheck, + LAPACKE_dsb_trans, + LAPACKE_dsp_nancheck, + LAPACKE_dsp_trans, + LAPACKE_dst_nancheck, + LAPACKE_dsy_nancheck, + LAPACKE_dsy_trans, + LAPACKE_dtb_nancheck, + LAPACKE_dtb_trans, + LAPACKE_dtf_nancheck, + LAPACKE_dtf_trans, + LAPACKE_dtp_nancheck, + LAPACKE_dtp_trans, + LAPACKE_dtr_nancheck, + LAPACKE_dtr_trans, + LAPACKE_lsame, + LAPACKE_sgb_nancheck, + LAPACKE_sgb_trans, + LAPACKE_sge_nancheck, + LAPACKE_sge_trans, + LAPACKE_sgg_nancheck, + LAPACKE_sgg_trans, + LAPACKE_sgt_nancheck, + LAPACKE_shs_nancheck, + LAPACKE_shs_trans, + LAPACKE_s_nancheck, + LAPACKE_spb_nancheck, + LAPACKE_spb_trans, + LAPACKE_spf_nancheck, + LAPACKE_spf_trans, + LAPACKE_spo_nancheck, + LAPACKE_spo_trans, + LAPACKE_spp_nancheck, + LAPACKE_spp_trans, + LAPACKE_spt_nancheck, + LAPACKE_ssb_nancheck, + LAPACKE_ssb_trans, + LAPACKE_ssp_nancheck, + LAPACKE_ssp_trans, + LAPACKE_sst_nancheck, + LAPACKE_ssy_nancheck, + LAPACKE_ssy_trans, + LAPACKE_stb_nancheck, + LAPACKE_stb_trans, + LAPACKE_stf_nancheck, + LAPACKE_stf_trans, + LAPACKE_stp_nancheck, + LAPACKE_stp_trans, + LAPACKE_str_nancheck, + LAPACKE_str_trans, + LAPACKE_xerbla, + LAPACKE_zgb_nancheck, + LAPACKE_zgb_trans, + LAPACKE_zge_nancheck, + LAPACKE_zge_trans, + LAPACKE_zgg_nancheck, + LAPACKE_zgg_trans, + LAPACKE_zgt_nancheck, + LAPACKE_zhb_nancheck, + LAPACKE_zhb_trans, + LAPACKE_zhe_nancheck, + LAPACKE_zhe_trans, + LAPACKE_zhp_nancheck, + LAPACKE_zhp_trans, + LAPACKE_zhs_nancheck, + LAPACKE_zhs_trans, + LAPACKE_z_nancheck, + LAPACKE_zpb_nancheck, + LAPACKE_zpb_trans, + LAPACKE_zpf_nancheck, + LAPACKE_zpf_trans, + LAPACKE_zpo_nancheck, + LAPACKE_zpo_trans, + LAPACKE_zpp_nancheck, + LAPACKE_zpp_trans, + LAPACKE_zpt_nancheck, + LAPACKE_zsp_nancheck, + LAPACKE_zsp_trans, + LAPACKE_zst_nancheck, + LAPACKE_zsy_nancheck, + LAPACKE_zsy_trans, + LAPACKE_ztb_nancheck, + LAPACKE_ztb_trans, + LAPACKE_ztf_nancheck, + LAPACKE_ztf_trans, + LAPACKE_ztp_nancheck, + LAPACKE_ztp_trans, + LAPACKE_ztr_nancheck, + LAPACKE_ztr_trans, + lapack_make_complex_float, + lapack_make_complex_double, + + # @(SRC_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` LAPACKE_cbbcsd, LAPACKE_cbbcsd_work, LAPACKE_cbdsqr, LAPACKE_cbdsqr_work, - 
LAPACKE_cgb_nancheck, - LAPACKE_cgb_trans, LAPACKE_cgbbrd, LAPACKE_cgbbrd_work, LAPACKE_cgbcon, @@ -457,8 +612,6 @@ LAPACKE_cgbtrf_work, LAPACKE_cgbtrs, LAPACKE_cgbtrs_work, - LAPACKE_cge_nancheck, - LAPACKE_cge_trans, LAPACKE_cgebak, LAPACKE_cgebak_work, LAPACKE_cgebal, @@ -533,8 +686,6 @@ LAPACKE_cgetri_work, LAPACKE_cgetrs, LAPACKE_cgetrs_work, - LAPACKE_cgg_nancheck, - LAPACKE_cgg_trans, LAPACKE_cggbak, LAPACKE_cggbak_work, LAPACKE_cggbal, @@ -561,7 +712,6 @@ LAPACKE_cggsvd_work, LAPACKE_cggsvp, LAPACKE_cggsvp_work, - LAPACKE_cgt_nancheck, LAPACKE_cgtcon, LAPACKE_cgtcon_work, LAPACKE_cgtrfs, @@ -574,8 +724,6 @@ LAPACKE_cgttrf_work, LAPACKE_cgttrs, LAPACKE_cgttrs_work, - LAPACKE_chb_nancheck, - LAPACKE_chb_trans, LAPACKE_chbev, LAPACKE_chbev_work, LAPACKE_chbevd, @@ -592,8 +740,6 @@ LAPACKE_chbgvx_work, LAPACKE_chbtrd, LAPACKE_chbtrd_work, - LAPACKE_che_nancheck, - LAPACKE_che_trans, LAPACKE_checon, LAPACKE_checon_work, LAPACKE_cheequb, @@ -640,8 +786,6 @@ LAPACKE_chfrk_work, LAPACKE_chgeqz, LAPACKE_chgeqz_work, - LAPACKE_chp_nancheck, - LAPACKE_chp_trans, LAPACKE_chpcon, LAPACKE_chpcon_work, LAPACKE_chpev, @@ -672,8 +816,6 @@ LAPACKE_chptri_work, LAPACKE_chptrs, LAPACKE_chptrs_work, - LAPACKE_chs_nancheck, - LAPACKE_chs_trans, LAPACKE_chsein, LAPACKE_chsein_work, LAPACKE_chseqr, @@ -710,8 +852,6 @@ LAPACKE_claswp_work, LAPACKE_clauum, LAPACKE_clauum_work, - LAPACKE_cpb_nancheck, - LAPACKE_cpb_trans, LAPACKE_cpbcon, LAPACKE_cpbcon_work, LAPACKE_cpbequ, @@ -728,16 +868,12 @@ LAPACKE_cpbtrf_work, LAPACKE_cpbtrs, LAPACKE_cpbtrs_work, - LAPACKE_cpf_nancheck, - LAPACKE_cpf_trans, LAPACKE_cpftrf, LAPACKE_cpftrf_work, LAPACKE_cpftri, LAPACKE_cpftri_work, LAPACKE_cpftrs, LAPACKE_cpftrs_work, - LAPACKE_cpo_nancheck, - LAPACKE_cpo_trans, LAPACKE_cpocon, LAPACKE_cpocon_work, LAPACKE_cpoequ, @@ -756,8 +892,6 @@ LAPACKE_cpotri_work, LAPACKE_cpotrs, LAPACKE_cpotrs_work, - LAPACKE_cpp_nancheck, - LAPACKE_cpp_trans, LAPACKE_cppcon, LAPACKE_cppcon_work, LAPACKE_cppequ, @@ -776,7 +910,6 @@ LAPACKE_cpptrs_work, LAPACKE_cpstrf, LAPACKE_cpstrf_work, - LAPACKE_cpt_nancheck, LAPACKE_cptcon, LAPACKE_cptcon_work, LAPACKE_cpteqr, @@ -791,8 +924,6 @@ LAPACKE_cpttrf_work, LAPACKE_cpttrs, LAPACKE_cpttrs_work, - LAPACKE_csp_nancheck, - LAPACKE_csp_trans, LAPACKE_cspcon, LAPACKE_cspcon_work, LAPACKE_csprfs, @@ -807,7 +938,6 @@ LAPACKE_csptri_work, LAPACKE_csptrs, LAPACKE_csptrs_work, - LAPACKE_cst_nancheck, LAPACKE_cstedc, LAPACKE_cstedc_work, LAPACKE_cstegr, @@ -818,16 +948,12 @@ LAPACKE_cstemr_work, LAPACKE_csteqr, LAPACKE_csteqr_work, - LAPACKE_csy_nancheck, - LAPACKE_csy_trans, LAPACKE_csycon, LAPACKE_csycon_work, LAPACKE_csyconv, LAPACKE_csyconv_work, LAPACKE_csyequb, LAPACKE_csyequb_work, - LAPACKE_csyr, - LAPACKE_csyr_work, LAPACKE_csyrfs, LAPACKE_csyrfs_work, LAPACKE_csysv, @@ -848,16 +974,12 @@ LAPACKE_csytrs2, LAPACKE_csytrs2_work, LAPACKE_csytrs_work, - LAPACKE_ctb_nancheck, - LAPACKE_ctb_trans, LAPACKE_ctbcon, LAPACKE_ctbcon_work, LAPACKE_ctbrfs, LAPACKE_ctbrfs_work, LAPACKE_ctbtrs, LAPACKE_ctbtrs_work, - LAPACKE_ctf_nancheck, - LAPACKE_ctf_trans, LAPACKE_ctfsm, LAPACKE_ctfsm_work, LAPACKE_ctftri, @@ -878,8 +1000,6 @@ LAPACKE_ctgsna_work, LAPACKE_ctgsyl, LAPACKE_ctgsyl_work, - LAPACKE_ctp_nancheck, - LAPACKE_ctp_trans, LAPACKE_ctpcon, LAPACKE_ctpcon_work, LAPACKE_ctpmqrt, @@ -900,8 +1020,6 @@ LAPACKE_ctpttf_work, LAPACKE_ctpttr, LAPACKE_ctpttr_work, - LAPACKE_ctr_nancheck, - LAPACKE_ctr_trans, LAPACKE_ctrcon, LAPACKE_ctrcon_work, LAPACKE_ctrevc, @@ -964,7 +1082,6 @@ LAPACKE_cupgtr_work, LAPACKE_cupmtr, 
LAPACKE_cupmtr_work, - LAPACKE_d_nancheck, LAPACKE_dbbcsd, LAPACKE_dbbcsd_work, LAPACKE_dbdsdc, @@ -973,8 +1090,6 @@ LAPACKE_dbdsqr_work, LAPACKE_ddisna, LAPACKE_ddisna_work, - LAPACKE_dgb_nancheck, - LAPACKE_dgb_trans, LAPACKE_dgbbrd, LAPACKE_dgbbrd_work, LAPACKE_dgbcon, @@ -993,8 +1108,6 @@ LAPACKE_dgbtrf_work, LAPACKE_dgbtrs, LAPACKE_dgbtrs_work, - LAPACKE_dge_nancheck, - LAPACKE_dge_trans, LAPACKE_dgebak, LAPACKE_dgebak_work, LAPACKE_dgebal, @@ -1073,8 +1186,6 @@ LAPACKE_dgetri_work, LAPACKE_dgetrs, LAPACKE_dgetrs_work, - LAPACKE_dgg_nancheck, - LAPACKE_dgg_trans, LAPACKE_dggbak, LAPACKE_dggbak_work, LAPACKE_dggbal, @@ -1101,7 +1212,6 @@ LAPACKE_dggsvd_work, LAPACKE_dggsvp, LAPACKE_dggsvp_work, - LAPACKE_dgt_nancheck, LAPACKE_dgtcon, LAPACKE_dgtcon_work, LAPACKE_dgtrfs, @@ -1116,8 +1226,6 @@ LAPACKE_dgttrs_work, LAPACKE_dhgeqz, LAPACKE_dhgeqz_work, - LAPACKE_dhs_nancheck, - LAPACKE_dhs_trans, LAPACKE_dhsein, LAPACKE_dhsein_work, LAPACKE_dhseqr, @@ -1200,8 +1308,6 @@ LAPACKE_dormrz_work, LAPACKE_dormtr, LAPACKE_dormtr_work, - LAPACKE_dpb_nancheck, - LAPACKE_dpb_trans, LAPACKE_dpbcon, LAPACKE_dpbcon_work, LAPACKE_dpbequ, @@ -1218,16 +1324,12 @@ LAPACKE_dpbtrf_work, LAPACKE_dpbtrs, LAPACKE_dpbtrs_work, - LAPACKE_dpf_nancheck, - LAPACKE_dpf_trans, LAPACKE_dpftrf, LAPACKE_dpftrf_work, LAPACKE_dpftri, LAPACKE_dpftri_work, LAPACKE_dpftrs, LAPACKE_dpftrs_work, - LAPACKE_dpo_nancheck, - LAPACKE_dpo_trans, LAPACKE_dpocon, LAPACKE_dpocon_work, LAPACKE_dpoequ, @@ -1246,8 +1348,6 @@ LAPACKE_dpotri_work, LAPACKE_dpotrs, LAPACKE_dpotrs_work, - LAPACKE_dpp_nancheck, - LAPACKE_dpp_trans, LAPACKE_dppcon, LAPACKE_dppcon_work, LAPACKE_dppequ, @@ -1266,7 +1366,6 @@ LAPACKE_dpptrs_work, LAPACKE_dpstrf, LAPACKE_dpstrf_work, - LAPACKE_dpt_nancheck, LAPACKE_dptcon, LAPACKE_dptcon_work, LAPACKE_dpteqr, @@ -1281,8 +1380,6 @@ LAPACKE_dpttrf_work, LAPACKE_dpttrs, LAPACKE_dpttrs_work, - LAPACKE_dsb_nancheck, - LAPACKE_dsb_trans, LAPACKE_dsbev, LAPACKE_dsbev_work, LAPACKE_dsbevd, @@ -1303,8 +1400,6 @@ LAPACKE_dsfrk_work, LAPACKE_dsgesv, LAPACKE_dsgesv_work, - LAPACKE_dsp_nancheck, - LAPACKE_dsp_trans, LAPACKE_dspcon, LAPACKE_dspcon_work, LAPACKE_dspev, @@ -1337,7 +1432,6 @@ LAPACKE_dsptri_work, LAPACKE_dsptrs, LAPACKE_dsptrs_work, - LAPACKE_dst_nancheck, LAPACKE_dstebz, LAPACKE_dstebz_work, LAPACKE_dstedc, @@ -1360,8 +1454,6 @@ LAPACKE_dstevr_work, LAPACKE_dstevx, LAPACKE_dstevx_work, - LAPACKE_dsy_nancheck, - LAPACKE_dsy_trans, LAPACKE_dsycon, LAPACKE_dsycon_work, LAPACKE_dsyconv, @@ -1406,16 +1498,12 @@ LAPACKE_dsytrs2, LAPACKE_dsytrs2_work, LAPACKE_dsytrs_work, - LAPACKE_dtb_nancheck, - LAPACKE_dtb_trans, LAPACKE_dtbcon, LAPACKE_dtbcon_work, LAPACKE_dtbrfs, LAPACKE_dtbrfs_work, LAPACKE_dtbtrs, LAPACKE_dtbtrs_work, - LAPACKE_dtf_nancheck, - LAPACKE_dtf_trans, LAPACKE_dtfsm, LAPACKE_dtfsm_work, LAPACKE_dtftri, @@ -1436,8 +1524,6 @@ LAPACKE_dtgsna_work, LAPACKE_dtgsyl, LAPACKE_dtgsyl_work, - LAPACKE_dtp_nancheck, - LAPACKE_dtp_trans, LAPACKE_dtpcon, LAPACKE_dtpcon_work, LAPACKE_dtpmqrt, @@ -1458,8 +1544,6 @@ LAPACKE_dtpttf_work, LAPACKE_dtpttr, LAPACKE_dtpttr_work, - LAPACKE_dtr_nancheck, - LAPACKE_dtr_trans, LAPACKE_dtrcon, LAPACKE_dtrcon_work, LAPACKE_dtrevc, @@ -1484,8 +1568,6 @@ LAPACKE_dtrttp_work, LAPACKE_dtzrzf, LAPACKE_dtzrzf_work, - LAPACKE_lsame, - LAPACKE_s_nancheck, LAPACKE_sbbcsd, LAPACKE_sbbcsd_work, LAPACKE_sbdsdc, @@ -1494,8 +1576,6 @@ LAPACKE_sbdsqr_work, LAPACKE_sdisna, LAPACKE_sdisna_work, - LAPACKE_sgb_nancheck, - LAPACKE_sgb_trans, LAPACKE_sgbbrd, LAPACKE_sgbbrd_work, LAPACKE_sgbcon, @@ -1514,8 
+1594,6 @@ LAPACKE_sgbtrf_work, LAPACKE_sgbtrs, LAPACKE_sgbtrs_work, - LAPACKE_sge_nancheck, - LAPACKE_sge_trans, LAPACKE_sgebak, LAPACKE_sgebak_work, LAPACKE_sgebal, @@ -1594,8 +1672,6 @@ LAPACKE_sgetri_work, LAPACKE_sgetrs, LAPACKE_sgetrs_work, - LAPACKE_sgg_nancheck, - LAPACKE_sgg_trans, LAPACKE_sggbak, LAPACKE_sggbak_work, LAPACKE_sggbal, @@ -1622,7 +1698,6 @@ LAPACKE_sggsvd_work, LAPACKE_sggsvp, LAPACKE_sggsvp_work, - LAPACKE_sgt_nancheck, LAPACKE_sgtcon, LAPACKE_sgtcon_work, LAPACKE_sgtrfs, @@ -1637,8 +1712,6 @@ LAPACKE_sgttrs_work, LAPACKE_shgeqz, LAPACKE_shgeqz_work, - LAPACKE_shs_nancheck, - LAPACKE_shs_trans, LAPACKE_shsein, LAPACKE_shsein_work, LAPACKE_shseqr, @@ -1721,8 +1794,6 @@ LAPACKE_sormrz_work, LAPACKE_sormtr, LAPACKE_sormtr_work, - LAPACKE_spb_nancheck, - LAPACKE_spb_trans, LAPACKE_spbcon, LAPACKE_spbcon_work, LAPACKE_spbequ, @@ -1739,16 +1810,12 @@ LAPACKE_spbtrf_work, LAPACKE_spbtrs, LAPACKE_spbtrs_work, - LAPACKE_spf_nancheck, - LAPACKE_spf_trans, LAPACKE_spftrf, LAPACKE_spftrf_work, LAPACKE_spftri, LAPACKE_spftri_work, LAPACKE_spftrs, LAPACKE_spftrs_work, - LAPACKE_spo_nancheck, - LAPACKE_spo_trans, LAPACKE_spocon, LAPACKE_spocon_work, LAPACKE_spoequ, @@ -1767,8 +1834,6 @@ LAPACKE_spotri_work, LAPACKE_spotrs, LAPACKE_spotrs_work, - LAPACKE_spp_nancheck, - LAPACKE_spp_trans, LAPACKE_sppcon, LAPACKE_sppcon_work, LAPACKE_sppequ, @@ -1787,7 +1852,6 @@ LAPACKE_spptrs_work, LAPACKE_spstrf, LAPACKE_spstrf_work, - LAPACKE_spt_nancheck, LAPACKE_sptcon, LAPACKE_sptcon_work, LAPACKE_spteqr, @@ -1802,8 +1866,6 @@ LAPACKE_spttrf_work, LAPACKE_spttrs, LAPACKE_spttrs_work, - LAPACKE_ssb_nancheck, - LAPACKE_ssb_trans, LAPACKE_ssbev, LAPACKE_ssbev_work, LAPACKE_ssbevd, @@ -1822,8 +1884,6 @@ LAPACKE_ssbtrd_work, LAPACKE_ssfrk, LAPACKE_ssfrk_work, - LAPACKE_ssp_nancheck, - LAPACKE_ssp_trans, LAPACKE_sspcon, LAPACKE_sspcon_work, LAPACKE_sspev, @@ -1854,7 +1914,6 @@ LAPACKE_ssptri_work, LAPACKE_ssptrs, LAPACKE_ssptrs_work, - LAPACKE_sst_nancheck, LAPACKE_sstebz, LAPACKE_sstebz_work, LAPACKE_sstedc, @@ -1877,8 +1936,6 @@ LAPACKE_sstevr_work, LAPACKE_sstevx, LAPACKE_sstevx_work, - LAPACKE_ssy_nancheck, - LAPACKE_ssy_trans, LAPACKE_ssycon, LAPACKE_ssycon_work, LAPACKE_ssyconv, @@ -1923,16 +1980,12 @@ LAPACKE_ssytrs2, LAPACKE_ssytrs2_work, LAPACKE_ssytrs_work, - LAPACKE_stb_nancheck, - LAPACKE_stb_trans, LAPACKE_stbcon, LAPACKE_stbcon_work, LAPACKE_stbrfs, LAPACKE_stbrfs_work, LAPACKE_stbtrs, LAPACKE_stbtrs_work, - LAPACKE_stf_nancheck, - LAPACKE_stf_trans, LAPACKE_stfsm, LAPACKE_stfsm_work, LAPACKE_stftri, @@ -1953,8 +2006,6 @@ LAPACKE_stgsna_work, LAPACKE_stgsyl, LAPACKE_stgsyl_work, - LAPACKE_stp_nancheck, - LAPACKE_stp_trans, LAPACKE_stpcon, LAPACKE_stpcon_work, LAPACKE_stpmqrt, @@ -1973,8 +2024,6 @@ LAPACKE_stpttf_work, LAPACKE_stpttr, LAPACKE_stpttr_work, - LAPACKE_str_nancheck, - LAPACKE_str_trans, LAPACKE_strcon, LAPACKE_strcon_work, LAPACKE_strevc, @@ -1999,8 +2048,6 @@ LAPACKE_strttp_work, LAPACKE_stzrzf, LAPACKE_stzrzf_work, - LAPACKE_xerbla, - LAPACKE_z_nancheck, LAPACKE_zbbcsd, LAPACKE_zbbcsd_work, LAPACKE_zbdsqr, @@ -2009,8 +2056,6 @@ LAPACKE_zcgesv_work, LAPACKE_zcposv, LAPACKE_zcposv_work, - LAPACKE_zgb_nancheck, - LAPACKE_zgb_trans, LAPACKE_zgbbrd, LAPACKE_zgbbrd_work, LAPACKE_zgbcon, @@ -2029,8 +2074,6 @@ LAPACKE_zgbtrf_work, LAPACKE_zgbtrs, LAPACKE_zgbtrs_work, - LAPACKE_zge_nancheck, - LAPACKE_zge_trans, LAPACKE_zgebak, LAPACKE_zgebak_work, LAPACKE_zgebal, @@ -2105,8 +2148,6 @@ LAPACKE_zgetri_work, LAPACKE_zgetrs, LAPACKE_zgetrs_work, - LAPACKE_zgg_nancheck, - 
LAPACKE_zgg_trans, LAPACKE_zggbak, LAPACKE_zggbak_work, LAPACKE_zggbal, @@ -2133,7 +2174,6 @@ LAPACKE_zggsvd_work, LAPACKE_zggsvp, LAPACKE_zggsvp_work, - LAPACKE_zgt_nancheck, LAPACKE_zgtcon, LAPACKE_zgtcon_work, LAPACKE_zgtrfs, @@ -2146,8 +2186,6 @@ LAPACKE_zgttrf_work, LAPACKE_zgttrs, LAPACKE_zgttrs_work, - LAPACKE_zhb_nancheck, - LAPACKE_zhb_trans, LAPACKE_zhbev, LAPACKE_zhbev_work, LAPACKE_zhbevd, @@ -2164,8 +2202,6 @@ LAPACKE_zhbgvx_work, LAPACKE_zhbtrd, LAPACKE_zhbtrd_work, - LAPACKE_zhe_nancheck, - LAPACKE_zhe_trans, LAPACKE_zhecon, LAPACKE_zhecon_work, LAPACKE_zheequb, @@ -2212,8 +2248,6 @@ LAPACKE_zhfrk_work, LAPACKE_zhgeqz, LAPACKE_zhgeqz_work, - LAPACKE_zhp_nancheck, - LAPACKE_zhp_trans, LAPACKE_zhpcon, LAPACKE_zhpcon_work, LAPACKE_zhpev, @@ -2244,8 +2278,6 @@ LAPACKE_zhptri_work, LAPACKE_zhptrs, LAPACKE_zhptrs_work, - LAPACKE_zhs_nancheck, - LAPACKE_zhs_trans, LAPACKE_zhsein, LAPACKE_zhsein_work, LAPACKE_zhseqr, @@ -2282,8 +2314,6 @@ LAPACKE_zlaswp_work, LAPACKE_zlauum, LAPACKE_zlauum_work, - LAPACKE_zpb_nancheck, - LAPACKE_zpb_trans, LAPACKE_zpbcon, LAPACKE_zpbcon_work, LAPACKE_zpbequ, @@ -2300,16 +2330,12 @@ LAPACKE_zpbtrf_work, LAPACKE_zpbtrs, LAPACKE_zpbtrs_work, - LAPACKE_zpf_nancheck, - LAPACKE_zpf_trans, LAPACKE_zpftrf, LAPACKE_zpftrf_work, LAPACKE_zpftri, LAPACKE_zpftri_work, LAPACKE_zpftrs, LAPACKE_zpftrs_work, - LAPACKE_zpo_nancheck, - LAPACKE_zpo_trans, LAPACKE_zpocon, LAPACKE_zpocon_work, LAPACKE_zpoequ, @@ -2328,8 +2354,6 @@ LAPACKE_zpotri_work, LAPACKE_zpotrs, LAPACKE_zpotrs_work, - LAPACKE_zpp_nancheck, - LAPACKE_zpp_trans, LAPACKE_zppcon, LAPACKE_zppcon_work, LAPACKE_zppequ, @@ -2348,7 +2372,6 @@ LAPACKE_zpptrs_work, LAPACKE_zpstrf, LAPACKE_zpstrf_work, - LAPACKE_zpt_nancheck, LAPACKE_zptcon, LAPACKE_zptcon_work, LAPACKE_zpteqr, @@ -2363,8 +2386,6 @@ LAPACKE_zpttrf_work, LAPACKE_zpttrs, LAPACKE_zpttrs_work, - LAPACKE_zsp_nancheck, - LAPACKE_zsp_trans, LAPACKE_zspcon, LAPACKE_zspcon_work, LAPACKE_zsprfs, @@ -2379,7 +2400,6 @@ LAPACKE_zsptri_work, LAPACKE_zsptrs, LAPACKE_zsptrs_work, - LAPACKE_zst_nancheck, LAPACKE_zstedc, LAPACKE_zstedc_work, LAPACKE_zstegr, @@ -2390,16 +2410,12 @@ LAPACKE_zstemr_work, LAPACKE_zsteqr, LAPACKE_zsteqr_work, - LAPACKE_zsy_nancheck, - LAPACKE_zsy_trans, LAPACKE_zsycon, LAPACKE_zsycon_work, LAPACKE_zsyconv, LAPACKE_zsyconv_work, LAPACKE_zsyequb, LAPACKE_zsyequb_work, - LAPACKE_zsyr, - LAPACKE_zsyr_work, LAPACKE_zsyrfs, LAPACKE_zsyrfs_work, LAPACKE_zsysv, @@ -2420,16 +2436,12 @@ LAPACKE_zsytrs2, LAPACKE_zsytrs2_work, LAPACKE_zsytrs_work, - LAPACKE_ztb_nancheck, - LAPACKE_ztb_trans, LAPACKE_ztbcon, LAPACKE_ztbcon_work, LAPACKE_ztbrfs, LAPACKE_ztbrfs_work, LAPACKE_ztbtrs, LAPACKE_ztbtrs_work, - LAPACKE_ztf_nancheck, - LAPACKE_ztf_trans, LAPACKE_ztfsm, LAPACKE_ztfsm_work, LAPACKE_ztftri, @@ -2450,8 +2462,6 @@ LAPACKE_ztgsna_work, LAPACKE_ztgsyl, LAPACKE_ztgsyl_work, - LAPACKE_ztp_nancheck, - LAPACKE_ztp_trans, LAPACKE_ztpcon, LAPACKE_ztpcon_work, LAPACKE_ztpmqrt, @@ -2472,8 +2482,6 @@ LAPACKE_ztpttf_work, LAPACKE_ztpttr, LAPACKE_ztpttr_work, - LAPACKE_ztr_nancheck, - LAPACKE_ztr_trans, LAPACKE_ztrcon, LAPACKE_ztrcon_work, LAPACKE_ztrevc, @@ -2536,6 +2544,118 @@ LAPACKE_zupgtr_work, LAPACKE_zupmtr, LAPACKE_zupmtr_work, + LAPACKE_zsyr, + LAPACKE_csyr, + LAPACKE_zsyr_work, + LAPACKE_csyr_work, + + ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the + ## corresponding LAPACK extended precision routines. 
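+ ## (These are LAPACK's extra-precise iterative refinement (*rfsx) and
+ ## expert solver (*svxx) drivers; presumably re-enabling them would also
+ ## require an XBLAS-enabled LAPACK build, not just LAPACKE_EXTENDED.)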
+ #LAPACKE_cgbrfsx, + #LAPACKE_cporfsx, + #LAPACKE_dgerfsx, + #LAPACKE_sgbrfsx, + #LAPACKE_ssyrfsx, + #LAPACKE_zherfsx, + #LAPACKE_cgbrfsx_work, + #LAPACKE_cporfsx_work, + #LAPACKE_dgerfsx_work, + #LAPACKE_sgbrfsx_work, + #LAPACKE_ssyrfsx_work, + #LAPACKE_zherfsx_work, + #LAPACKE_cgerfsx, + #LAPACKE_csyrfsx, + #LAPACKE_dporfsx, + #LAPACKE_sgerfsx, + #LAPACKE_zgbrfsx, + #LAPACKE_zporfsx, + #LAPACKE_cgerfsx_work, + #LAPACKE_csyrfsx_work, + #LAPACKE_dporfsx_work, + #LAPACKE_sgerfsx_work, + #LAPACKE_zgbrfsx_work, + #LAPACKE_zporfsx_work, + #LAPACKE_cherfsx, + #LAPACKE_dgbrfsx, + #LAPACKE_dsyrfsx, + #LAPACKE_sporfsx, + #LAPACKE_zgerfsx, + #LAPACKE_zsyrfsx, + #LAPACKE_cherfsx_work, + #LAPACKE_dgbrfsx_work, + #LAPACKE_dsyrfsx_work, + #LAPACKE_sporfsx_work, + #LAPACKE_zgerfsx_work, + #LAPACKE_zsyrfsx_work, + #LAPACKE_cgbsvxx, + #LAPACKE_cposvxx, + #LAPACKE_dgesvxx, + #LAPACKE_sgbsvxx, + #LAPACKE_ssysvxx, + #LAPACKE_zhesvxx, + #LAPACKE_cgbsvxx_work, + #LAPACKE_cposvxx_work, + #LAPACKE_dgesvxx_work, + #LAPACKE_sgbsvxx_work, + #LAPACKE_ssysvxx_work, + #LAPACKE_zhesvxx_work, + #LAPACKE_cgesvxx, + #LAPACKE_csysvxx, + #LAPACKE_dposvxx, + #LAPACKE_sgesvxx, + #LAPACKE_zgbsvxx, + #LAPACKE_zposvxx, + #LAPACKE_cgesvxx_work, + #LAPACKE_csysvxx_work, + #LAPACKE_dposvxx_work, + #LAPACKE_sgesvxx_work, + #LAPACKE_zgbsvxx_work, + #LAPACKE_zposvxx_work, + #LAPACKE_chesvxx, + #LAPACKE_dgbsvxx, + #LAPACKE_dsysvxx, + #LAPACKE_sposvxx, + #LAPACKE_zgesvxx, + #LAPACKE_zsysvxx, + #LAPACKE_chesvxx_work, + #LAPACKE_dgbsvxx_work, + #LAPACKE_dsysvxx_work, + #LAPACKE_sposvxx_work, + #LAPACKE_zgesvxx_work, + #LAPACKE_zsysvxx_work, + + ## @(MATGEN_OBJ) from `lapack-3.4.1/lapacke/src/Makefile` + ## Not exported: requires LAPACKE_TESTING to be set and depends on libtmg + ## (see `lapack-3.4.1/TESTING/MATGEN`). + #LAPACKE_clatms, + #LAPACKE_clatms_work, + #LAPACKE_dlatms, + #LAPACKE_dlatms_work, + #LAPACKE_slatms, + #LAPACKE_slatms_work, + #LAPACKE_zlatms, + #LAPACKE_zlatms_work, + #LAPACKE_clagge, + #LAPACKE_clagge_work, + #LAPACKE_dlagge, + #LAPACKE_dlagge_work, + #LAPACKE_slagge, + #LAPACKE_slagge_work, + #LAPACKE_zlagge, + #LAPACKE_zlagge_work, + #LAPACKE_claghe, + #LAPACKE_claghe_work, + #LAPACKE_zlaghe, + #LAPACKE_zlaghe_work, + #LAPACKE_clagsy, + #LAPACKE_clagsy_work, + #LAPACKE_dlagsy, + #LAPACKE_dlagsy_work, + #LAPACKE_slagsy, + #LAPACKE_slagsy_work, + #LAPACKE_zlagsy, + #LAPACKE_zlagsy_work, ); if ($ARGV[5] == 1) { From be1692d64fdaaee300f81f8594f64edb512ff39f Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Sun, 20 May 2012 00:49:38 -0300 Subject: [PATCH 11/46] fix 'sched_yield' warnings on FreeBSD,NetBSD --- common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common.h b/common.h index c6d30ddcf..0868b594a 100644 --- a/common.h +++ b/common.h @@ -89,7 +89,7 @@ extern "C" { #include #endif -#ifdef OS_DARWIN +#if defined(OS_DARWIN) || defined(OS_FREEBSD) || defined(OS_NETBSD) #include #endif From 14c3511e9271b06d57a7a3777dbe16b3717a48b7 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 May 2012 18:09:35 +0200 Subject: [PATCH 12/46] Respect C compiler set on the command line or inherited from the environment --- Makefile.system | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/Makefile.system b/Makefile.system index c9e74faa6..c72326ed5 100644 --- a/Makefile.system +++ b/Makefile.system @@ -14,7 +14,15 @@ NETLIB_LAPACK_DIR = $(TOPDIR)/lapack-3.4.1 endif # Default C compiler +# - Only set if not specified on the command line or inherited from the environment. 
+# - CC is an implicit variable so neither '?=' or 'ifndef' can be used. +# http://stackoverflow.com/questions/4029274/mingw-and-make-variables +# - Default value is 'cc' which is not always a valid command (e.g. MinGW). +ifeq ($(origin CC),default) CC = gcc +endif + +# Default Fortran compiler (FC) is selected by f_check. ifndef MAKEFILE_RULE include $(TOPDIR)/Makefile.rule From 44124d3055fe09449ca591fad2db22a20a01d252 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Sun, 20 May 2012 18:11:34 +0200 Subject: [PATCH 13/46] Fix Fortran compiler detection - Test with '-x' operator to ensure file is executable. - 'break' is not a valid Perl keyword. --- f_check | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/f_check b/f_check index 93c39ec88..8e3855b10 100644 --- a/f_check +++ b/f_check @@ -32,11 +32,12 @@ if ($compiler eq "") { "pgf95", "pgf90", "pgf77", "ifort"); +OUTER: foreach $lists (@lists) { foreach $path (@path) { - if (-f $path . "/" . $lists) { + if (-x $path . "/" . $lists) { $compiler = $lists; - break; + last OUTER; } } } From e9be1fdd2bf373800c1cf3a5217b09a018284b21 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Sun, 20 May 2012 21:44:15 -0300 Subject: [PATCH 14/46] FreeBSD: replace EXTRALIB -> FEXTRALIB --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 971bd0bed..83f2f5d0c 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest From 10e25690b4591a4a25b7963fca27a899efc658cf Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 21 May 2012 12:10:26 +0200 Subject: [PATCH 15/46] Fix FreeBSD build (undefined reference to `pthread_create') --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index 83f2f5d0c..971bd0bed 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) + -Wl,--retain-symbols-file=linux.def $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. 
rm -f linktest From a27339b24443fa57d1afa09f2d7ecfa7757e1f42 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Mon, 21 May 2012 12:25:12 +0200 Subject: [PATCH 16/46] DLL: replace FEXTRALIB -> EXTRALIB (for consistency) --- exports/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/exports/Makefile b/exports/Makefile index 971bd0bed..b50b521f1 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -70,11 +70,11 @@ dll2 : libgoto2_shared.dll $(RANLIB) ../$(LIBNAME) ifeq ($(BINARY32), 1) $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry _dllinit@12 -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:i386 /def:libopenblas.def else $(DLLWRAP) -o ../$(LIBDLLNAME) --def libopenblas.def \ - --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(FEXTRALIB) + --entry $(FU)dllinit -s dllinit.$(SUFFIX) --dllname $(@F) ../$(LIBNAME) $(EXTRALIB) -lib /machine:X64 /def:libopenblas.def endif From 839b18aa260a4443f9b13615cb583c2f08af79b1 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Mon, 21 May 2012 16:56:28 -0400 Subject: [PATCH 17/46] FreeBSD: allow CC & FC to have different versions --- exports/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/exports/Makefile b/exports/Makefile index b50b521f1..40a3a7c63 100644 --- a/exports/Makefile +++ b/exports/Makefile @@ -121,7 +121,7 @@ so : ../$(LIBSONAME) ../$(LIBSONAME) : ../$(LIBNAME) linux.def linktest.c $(CC) $(CFLAGS) -shared -o ../$(LIBSONAME) \ -Wl,--whole-archive ../$(LIBNAME) -Wl,--no-whole-archive \ - -Wl,--retain-symbols-file=linux.def $(EXTRALIB) + -Wl,--retain-symbols-file=linux.def $(FEXTRALIB) $(EXTRALIB) $(CC) $(CFLAGS) -w -o linktest linktest.c ../$(LIBSONAME) $(FEXTRALIB) && echo OK. rm -f linktest From 4e29b6ffc0a8e7c748975f44194098ad3d229f14 Mon Sep 17 00:00:00 2001 From: Mike Nolta Date: Mon, 21 May 2012 16:57:19 -0400 Subject: [PATCH 18/46] FreeBSD: fix OS_FreeBSD -> OS_FREEBSD typos --- common_x86.h | 2 +- common_x86_64.h | 2 +- driver/others/memory.c | 10 +++++----- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/common_x86.h b/common_x86.h index fbb91f888..eaf395806 100644 --- a/common_x86.h +++ b/common_x86.h @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index 53b702185..735c9b294 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FreeBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) #define PROLOGUE \ .text; \ .align 512; \ diff --git a/driver/others/memory.c b/driver/others/memory.c index 3f1a5f60a..9a925d290 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#include #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_Darwin) #include #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_Darwin) int get_num_procs(void) { @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) max_num = get_num_procs(); #endif @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FreeBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From a431042475d414d8b786804f351467a33d24f1ae Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 23 May 2012 00:01:14 +0200 Subject: [PATCH 19/46] Fix inconsistent case for OS_* macros (Refs pull request #111) --- c_check | 8 ++++---- common.h | 2 +- common_x86.h | 2 +- common_x86_64.h | 2 +- ctest.c | 8 ++++---- driver/others/memory.c | 10 +++++----- 6 files changed, 16 insertions(+), 16 deletions(-) diff --git a/c_check b/c_check index 6ce5e4cc0..b4105d19a 100644 --- a/c_check +++ b/c_check @@ -43,10 +43,10 @@ $compiler = DEC if ($data =~ /COMPILER_DEC/); $compiler = GCC if ($compiler eq ""); $os = Linux if ($data =~ /OS_LINUX/); -$os = FreeBSD if ($data =~ /OS_FreeBSD/); -$os = NetBSD if ($data =~ /OS_NetBSD/); -$os = Darwin if ($data =~ /OS_Darwin/); -$os = SunOS if ($data =~ /OS_SunOS/); +$os = FreeBSD if ($data =~ /OS_FREEBSD/); +$os = NetBSD if ($data =~ /OS_NETBSD/); +$os = Darwin if ($data =~ /OS_DARWIN/); +$os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); diff --git a/common.h b/common.h index 0868b594a..3718cdee4 100644 --- a/common.h +++ b/common.h @@ -68,7 +68,7 @@ extern "C" { #define SMP #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_Interix) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define WINDOWS_ABI #define OS_WINDOWS diff --git a/common_x86.h b/common_x86.h index eaf395806..468fc55eb 100644 --- a/common_x86.h +++ b/common_x86.h @@ -282,7 +282,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/common_x86_64.h b/common_x86_64.h index 735c9b294..2dc788c93 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -353,7 +353,7 @@ REALNAME: #define EPILOGUE .end REALNAME #endif -#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NetBSD) || defined(__ELF__) || defined(C_PGI) +#if defined(OS_LINUX) || defined(OS_FREEBSD) || defined(OS_NETBSD) || defined(__ELF__) || defined(C_PGI) #define 
PROLOGUE \ .text; \ .align 512; \ diff --git a/ctest.c b/ctest.c index 0c373bf2b..ac8283898 100644 --- a/ctest.c +++ b/ctest.c @@ -35,19 +35,19 @@ OS_LINUX #endif #if defined(__FreeBSD__) -OS_FreeBSD +OS_FREEBSD #endif #if defined(__NetBSD__) -OS_NetBSD +OS_NETBSD #endif #if defined(__sun) -OS_SunOS +OS_SUNOS #endif #if defined(__APPLE__) -OS_Darwin +OS_DARWIN #endif #if defined(_AIX) diff --git a/driver/others/memory.c b/driver/others/memory.c index 9a925d290..9b8863f39 100644 --- a/driver/others/memory.c +++ b/driver/others/memory.c @@ -103,7 +103,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #endif -#if defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) #include #endif @@ -185,7 +185,7 @@ int get_num_procs(void) { #endif -#if defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_FREEBSD) || defined(OS_DARWIN) int get_num_procs(void) { @@ -215,7 +215,7 @@ int goto_get_num_procs (void) { int blas_get_cpu_number(void){ char *p; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) int max_num; #endif int blas_goto_num = 0; @@ -223,7 +223,7 @@ int blas_get_cpu_number(void){ if (blas_num_threads) return blas_num_threads; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) max_num = get_num_procs(); #endif @@ -250,7 +250,7 @@ int blas_get_cpu_number(void){ else if (blas_omp_num > 0) blas_num_threads = blas_omp_num; else blas_num_threads = MAX_CPU_NUMBER; -#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_Darwin) +#if defined(OS_LINUX) || defined(OS_WINDOWS) || defined(OS_FREEBSD) || defined(OS_DARWIN) if (blas_num_threads > max_num) blas_num_threads = max_num; #endif From 5199809bba04ebcd176f29d8769285a64b364c08 Mon Sep 17 00:00:00 2001 From: Zaheer Chothia Date: Wed, 23 May 2012 00:04:04 +0200 Subject: [PATCH 20/46] Fix typo: OS_CYGWIN -> OS_CYGWIN_NT, OS_INERIX -> OS_INTERIX --- c_check | 2 +- common_x86.h | 4 ++-- ctest.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/c_check b/c_check index b4105d19a..4d82237d4 100644 --- a/c_check +++ b/c_check @@ -50,7 +50,7 @@ $os = SunOS if ($data =~ /OS_SUNOS/); $os = AIX if ($data =~ /OS_AIX/); $os = osf if ($data =~ /OS_OSF/); $os = WINNT if ($data =~ /OS_WINNT/); -$os = CYGWIN_NT if ($data =~ /OS_CYGWIN/); +$os = CYGWIN_NT if ($data =~ /OS_CYGWIN_NT/); $os = Interix if ($data =~ /OS_INTERIX/); $architecture = x86 if ($data =~ /ARCH_X86/); diff --git a/common_x86.h b/common_x86.h index 468fc55eb..8f1a0308d 100644 --- a/common_x86.h +++ b/common_x86.h @@ -254,7 +254,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define PROFCODE #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define SAVEREGISTERS \ subl $32, %esp;\ movups %xmm6, 0(%esp);\ @@ -269,7 +269,7 @@ static __inline int blas_quickdivide(unsigned int x, unsigned int y){ #define RESTOREREGISTERS #endif -#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INERIX) +#if defined(OS_WINNT) || defined(OS_CYGWIN_NT) || defined(OS_INTERIX) #define PROLOGUE \ .text; \ .align 16; \ diff --git a/ctest.c b/ctest.c index ac8283898..9fc0b0c40 100644 --- a/ctest.c +++ b/ctest.c @@ 
-63,7 +63,7 @@ OS_WINNT #endif #if defined(__CYGWIN__) -OS_CYGWIN +OS_CYGWIN_NT #endif #if defined(__INTERIX) From 8cc7f86cf7fd66ddf8f015e57c45315ae40daa17 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Fri, 25 May 2012 23:20:29 +0800 Subject: [PATCH 21/46] Detect Intel Core i7 3000 with Sandybridge. --- cpuid_x86.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpuid_x86.c b/cpuid_x86.c index 53b6f356c..0b9b5b6e6 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -990,7 +990,7 @@ int get_cpuname(void){ return CPUTYPE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CPUTYPE_NEHALEM; + return CPUTYPE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CPUTYPE_NEHALEM; @@ -1332,7 +1332,7 @@ int get_coretype(void){ return CORE_NEHALEM; case 13: //Intel Core i7-3000 / Xeon E5 (Sandy Bridge) - return CORE_NEHALEM; + return CORE_SANDYBRIDGE; case 15: //Xeon Processor E7 (Westmere-EX) return CORE_NEHALEM; From a6adbb299da0726eddaf95d4b32da8c5d0616227 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 29 May 2012 14:01:50 +0800 Subject: [PATCH 22/46] Refs #112. Improved setting thread affinity in Linux. Remove the limit (64) about the number of CPU cores. --- driver/others/init.c | 239 ++++++++++++++++++++++++++++++------------- 1 file changed, 167 insertions(+), 72 deletions(-) diff --git a/driver/others/init.c b/driver/others/init.c index 4adba661f..4a6f0aae8 100644 --- a/driver/others/init.c +++ b/driver/others/init.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -85,6 +85,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MAX_NODES 16 #define MAX_CPUS 256 +#define NCPUBITS (8*sizeof(unsigned long)) +#define MAX_BITMASK_LEN (MAX_CPUS/NCPUBITS) +#define CPUELT(cpu) ((cpu) / NCPUBITS) +#define CPUMASK(cpu) ((unsigned long) 1UL << ((cpu) % NCPUBITS)) + #define SH_MAGIC 0x510510 @@ -103,10 +108,10 @@ typedef struct { int num_nodes; int num_procs; int final_num_procs; - unsigned long avail; - + unsigned long avail [MAX_BITMASK_LEN]; + int avail_count; unsigned long cpu_info [MAX_CPUS]; - unsigned long node_info [MAX_NODES]; + unsigned long node_info [MAX_NODES][MAX_BITMASK_LEN]; int cpu_use[MAX_CPUS]; } shm_t; @@ -126,7 +131,8 @@ static shm_t *common = (void *)-1; static int shmid, pshmid; static void *paddr; -static unsigned long lprocmask, lnodemask; +static unsigned long lprocmask[MAX_BITMASK_LEN], lnodemask; +static int lprocmask_count = 0; static int numprocs = 1; static int numnodes = 1; @@ -177,70 +183,114 @@ static inline int rcount(unsigned long number) { than sizeof(unsigned long). On 64 bits, the limit is 64. On 32 bits, it is 32. 
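   (Illustrative mapping under the new multi-word scheme, assuming 64-bit
   unsigned long: cpu N is tracked in word CPUELT(N) = N / 64 at bit
   CPUMASK(N) = 1UL << (N % 64), so e.g. cpu 70 lands in avail[1], bit 6,
   and the old 64-CPU ceiling no longer applies.)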
***/ -static inline unsigned long get_cpumap(int node) { +static inline void get_cpumap(int node, unsigned long * node_info) { int infile; - unsigned long affinity; + unsigned long affinity[32]; char name[160]; char cpumap[160]; - char *p, *dummy; + char *dummy; int i=0; + int count=0; + int k=0; sprintf(name, CPUMAP_NAME, node); infile = open(name, O_RDONLY); + for(i=0; i<32; i++){ + affinity[i] = 0; + } - affinity = 0; - if (infile != -1) { read(infile, cpumap, sizeof(cpumap)); - p = cpumap; - while (*p != '\n' && i<160){ - if(*p != ',') { - name[i++]=*p; + + for(i=0; i<160; i++){ + if(cpumap[i] == '\n') + break; + if(cpumap[i] != ','){ + name[k++]=cpumap[i]; + + //Enough data for Hex + if(k >= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } } - p++; + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i= NCPUBITS/4){ + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + } + + } + if(k!=0){ + name[k]='\0'; + affinity[count++] = strtoul(name, &dummy, 16); + k=0; + } + // 0-63bit -> node_info[0], 64-128bit -> node_info[1] .... + // revert the sequence + for(i=0; i num_nodes = 0; @@ -258,7 +309,9 @@ static int numa_check(void) { return 0; } - for (node = 0; node < MAX_NODES; node ++) common -> node_info[node] = 0; + for (node = 0; node < MAX_NODES; node ++) { + for (j = 0; j node_info[node][j] = 0; + } while ((dir = readdir(dp)) != NULL) { if (*(unsigned int *) dir -> d_name == 0x065646f6eU) { @@ -266,12 +319,12 @@ static int numa_check(void) { node = atoi(&dir -> d_name[4]); if (node > MAX_NODES) { - fprintf(stderr, "\nGotoBLAS Warining : MAX_NODES (NUMA) is too small. Terminated.\n"); + fprintf(stderr, "\nOpenBLAS Warning : MAX_NODES (NUMA) is too small. Terminated.\n"); exit(1); } common -> num_nodes ++; - common -> node_info[node] = get_cpumap(node); + get_cpumap(node, common->node_info[node]); } } @@ -284,7 +337,7 @@ static int numa_check(void) { fprintf(stderr, "Numa found : number of Nodes = %2d\n", common -> num_nodes); for (node = 0; node < common -> num_nodes; node ++) - fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node]); + fprintf(stderr, "MASK (%2d) : %08lx\n", node, common -> node_info[node][0]); #endif return common -> num_nodes; @@ -296,11 +349,13 @@ static void numa_mapping(void) { int i, j, h; unsigned long work, bit; int count = 0; + int bitmask_idx = 0; for (node = 0; node < common -> num_nodes; node ++) { core = 0; for (cpu = 0; cpu < common -> num_procs; cpu ++) { - if (common -> node_info[node] & common -> avail & (1UL << cpu)) { + bitmask_idx = CPUELT(cpu); + if (common -> node_info[node][bitmask_idx] & common -> avail[bitmask_idx] & CPUMASK(cpu)) { common -> cpu_info[count] = WRITE_CORE(core) | WRITE_NODE(node) | WRITE_CPU(cpu); count ++; core ++; @@ -357,58 +412,89 @@ static void numa_mapping(void) { static void disable_hyperthread(void) { - unsigned long share; + unsigned long share[MAX_BITMASK_LEN]; int cpu; + int bitmask_idx = 0; + int i=0, count=0; + bitmask_idx = CPUELT(common -> num_procs); - if(common->num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->num_procs); - exit(1); - }else if(common->num_procs == 64){ - common -> avail = 0xFFFFFFFFFFFFFFFFUL; - }else - common -> avail = (1UL << common -> num_procs) - 1; + for(i=0; i< bitmask_idx; i++){ + common -> avail[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> num_procs) != 1){ + common -> avail[count++] = CPUMASK(common -> num_procs) - 1; + } + common -> avail_count = count; + + /* if(common->num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warning : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->num_procs); */ + /* exit(1); */ + /* }else if(common->num_procs == 64){ */ + /* common -> avail = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* common -> avail = (1UL << common -> num_procs) - 1; */ #ifdef DEBUG - fprintf(stderr, "\nAvail CPUs : %04lx.\n", common -> avail); + fprintf(stderr, "\nAvail CPUs : "); + for(i=0; i avail[i]); + fprintf(stderr, ".\n"); #endif for (cpu = 0; cpu < common -> num_procs; cpu ++) { - - share = (get_share(cpu, 1) & common -> avail); - - if (popcount(share) > 1) { + + get_share(cpu, 1, share); + + //When the shared cpu are in different element of share & avail array, this may be a bug. + for (i = 0; i < count ; i++){ + if (popcount(share[i]) > 1) { #ifdef DEBUG - fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", - cpu, share & ~(1UL << cpu)); + fprintf(stderr, "Detected Hyper Threading on CPU %4x; disabled CPU %04lx.\n", + cpu, share[i] & ~(CPUMASK(cpu))); #endif - common -> avail &= ~((share & ~(1UL << cpu))); + common -> avail[i] &= ~((share[i] & ~ CPUMASK(cpu))); + } } } } static void disable_affinity(void) { - + int i=0; + int bitmask_idx=0; + int count=0; #ifdef DEBUG - fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail); + fprintf(stderr, "Final all available CPUs : %04lx.\n\n", common -> avail[0]); fprintf(stderr, "CPU mask : %04lx.\n\n", *(unsigned long *)&cpu_orig_mask[0]); #endif - if(common->final_num_procs > 64){ - fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). Terminated.\n", common->final_num_procs); - exit(1); - }else if(common->final_num_procs == 64){ - lprocmask = 0xFFFFFFFFFFFFFFFFUL; - }else - lprocmask = (1UL << common -> final_num_procs) - 1; + /* if(common->final_num_procs > 64){ */ + /* fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(64). 
Terminated.\n", common->final_num_procs); */ + /* exit(1); */ + /* }else if(common->final_num_procs == 64){ */ + /* lprocmask = 0xFFFFFFFFFFFFFFFFUL; */ + /* }else */ + /* lprocmask = (1UL << common -> final_num_procs) - 1; */ + + bitmask_idx = CPUELT(common -> final_num_procs); + + for(i=0; i< bitmask_idx; i++){ + lprocmask[count++] = 0xFFFFFFFFFFFFFFFFUL; + } + if(CPUMASK(common -> final_num_procs) != 1){ + lprocmask[count++] = CPUMASK(common -> final_num_procs) - 1; + } + lprocmask_count = count; #ifndef USE_OPENMP - lprocmask &= *(unsigned long *)&cpu_orig_mask[0]; + for(i=0; i< count; i++){ + lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i]; + } #endif #ifdef DEBUG - fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask); + fprintf(stderr, "I choose these CPUs : %04lx.\n\n", lprocmask[0]); #endif } @@ -498,7 +584,7 @@ static void create_pshmem(void) { static void local_cpu_map(void) { int cpu, id, mapping; - + int bitmask_idx = 0; cpu = 0; mapping = 0; @@ -508,8 +594,9 @@ static void local_cpu_map(void) { if (id > 0) { if (is_dead(id)) common -> cpu_use[cpu] = 0; } - - if ((common -> cpu_use[cpu] == 0) && (lprocmask & (1UL << cpu))) { + + bitmask_idx = CPUELT(cpu); + if ((common -> cpu_use[cpu] == 0) && (lprocmask[bitmask_idx] & CPUMASK(cpu))) { common -> cpu_use[cpu] = pshmid; cpu_mapping[mapping] = READ_CPU(common -> cpu_info[cpu]); @@ -595,6 +682,7 @@ void gotoblas_affinity_init(void) { #ifndef USE_OPENMP cpu_set_t cpu_mask; #endif + int i; if (initialized) return; @@ -646,6 +734,11 @@ void gotoblas_affinity_init(void) { common -> num_procs = get_nprocs(); + if(common -> num_procs > MAX_CPUS) { + fprintf(stderr, "\nOpenBLAS Warining : The number of CPU/Cores(%d) is beyond the limit(%d). Terminated.\n", common->num_procs, MAX_CPUS); + exit(1); + } + for (cpu = 0; cpu < common -> num_procs; cpu++) common -> cpu_info[cpu] = cpu; numa_check(); @@ -654,7 +747,8 @@ void gotoblas_affinity_init(void) { if (common -> num_nodes > 1) numa_mapping(); - common -> final_num_procs = popcount(common -> avail); + common -> final_num_procs = 0; + for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]); for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0; @@ -664,7 +758,8 @@ void gotoblas_affinity_init(void) { disable_affinity(); - num_avail = popcount(lprocmask); + num_avail = 0; + for(i=0; i num_avail)) numprocs = num_avail; From a4daa34db77dd7410bd710be99cc22dd9dc5a5ce Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 30 May 2012 20:25:01 +0800 Subject: [PATCH 23/46] Refs #75. Use ffreep opcode directly. Please check out http://www.sandpile.org/x86/opc_fpu.htm . --- common_x86.h | 5 +++++ common_x86_64.h | 4 ++++ 2 files changed, 9 insertions(+) diff --git a/common_x86.h b/common_x86.h index 8f1a0308d..4c17f3a04 100644 --- a/common_x86.h +++ b/common_x86.h @@ -356,4 +356,9 @@ REALNAME: #ifndef ALIGN_6 #define ALIGN_6 .align 64 + +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. +// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#define ffreep .byte 0xdf, 0xc0 # #endif diff --git a/common_x86_64.h b/common_x86_64.h index 2dc788c93..e61e37e6b 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -448,4 +448,8 @@ REALNAME: #define ALIGN_6 .align 64 #endif +// ffreep %st(0). +// Because Clang didn't support ffreep, we directly use the opcode. 
+// Please check out http://www.sandpile.org/x86/opc_fpu.htm +#define ffreep .byte 0xdf, 0xc0 # #endif From 37edae1c90c01d65e47ff57b3f98d6bedbfc766b Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 17:17:02 +0800 Subject: [PATCH 24/46] Refs #75. Check ffreep macro before the define. --- common_x86.h | 2 ++ common_x86_64.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/common_x86.h b/common_x86.h index 4c17f3a04..4316318ec 100644 --- a/common_x86.h +++ b/common_x86.h @@ -360,5 +360,7 @@ REALNAME: // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif +#endif diff --git a/common_x86_64.h b/common_x86_64.h index e61e37e6b..7b6d11f7d 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,5 +451,7 @@ REALNAME: // ffreep %st(0). // Because Clang didn't support ffreep, we directly use the opcode. // Please check out http://www.sandpile.org/x86/opc_fpu.htm +#ifndef ffreep #define ffreep .byte 0xdf, 0xc0 # #endif +#endif From d6cab3f37ecab53d562e931ef358934940ac22d3 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 18:17:45 +0800 Subject: [PATCH 25/46] Refs #113. Support AMD Bobcate using Barcelona kernel codes. Replace 3DNow! with MMX. --- Makefile.system | 4 +- TargetList.txt | 1 + cpuid.h | 2 + cpuid_x86.c | 10 ++++- driver/others/parameter.c | 4 +- getarch.c | 18 +++++++- kernel/setparam-ref.c | 16 +++++++ kernel/x86/KERNEL.BOBCATE | 59 +++++++++++++++++++++++++ kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 +++++----- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 +++++----- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 ++--- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 +++++----- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 +- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 +- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 +- kernel/x86_64/KERNEL.BOBCATE | 62 +++++++++++++++++++++++++++ kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 7 +++ l2param.h | 2 +- param.h | 64 +++++++++++++++++++++++++++- 29 files changed, 303 insertions(+), 70 deletions(-) create mode 100644 kernel/x86/KERNEL.BOBCATE create mode 100644 kernel/x86_64/KERNEL.BOBCATE diff --git a/Makefile.system b/Makefile.system index e2c908e98..987bb83cf 100644 --- a/Makefile.system +++ b/Makefile.system @@ -247,11 +247,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO endif ifndef DYNAMIC_CORE diff --git a/TargetList.txt b/TargetList.txt index 9e0db4866..19008b862 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -28,6 +28,7 @@ OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL +BOBCATE c)VIA CPU: SSE_GENERIC diff --git a/cpuid.h b/cpuid.h index 
c0f21698d..1678d0a7e 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,6 +104,7 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 +#define CORE_BOBCATE 21 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -191,4 +192,5 @@ typedef struct { #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 +#define CPUTYPE_BOBCATE 45 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index 0b9b5b6e6..d31146a98 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1028,6 +1028,8 @@ int get_cpuname(void){ case 1: case 10: return CPUTYPE_BARCELONA; + case 5: + return CPUTYPE_BOBCATE; } break; } @@ -1148,6 +1150,7 @@ static char *cpuname[] = { "VIAC3", "NANO", "SANDYBRIDGE", + "BOBCATE", }; static char *lowercpuname[] = { @@ -1195,6 +1198,7 @@ static char *lowercpuname[] = { "nsgeode", "nano", "sandybridge", + "bobcate", }; static char *corename[] = { @@ -1219,6 +1223,7 @@ static char *corename[] = { "ATOM", "NANO", "SANDYBRIDGE", + "BOBCATE", }; static char *corename_lower[] = { @@ -1243,6 +1248,7 @@ static char *corename_lower[] = { "atom", "nano", "sandybridge", + "bobcate", }; @@ -1351,7 +1357,9 @@ int get_coretype(void){ if (family <= 0x5) return CORE_80486; if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ - if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; else return CORE_BARCELONA; + if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; + else if (exfamily == 5) return CORE_BOBCATE; + else return CORE_BARCELONA; } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 5ff1f2934..ab90b89f0 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) @@ -446,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE) size >>= 8; sgemm_p = 232 * size; diff --git a/getarch.c b/getarch.c index d8f467f03..a8c311035 100644 --- a/getarch.c +++ b/getarch.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -102,6 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ +/* #define FORCE_BOBCATE */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -363,6 +364,21 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CORENAME "BARCELONA" #endif +#if defined(FORCE_BOBCATE) +#define FORCE +#define FORCE_INTEL +#define ARCHITECTURE "X86" +#define SUBARCHITECTURE "BOBCATE" +#define ARCHCONFIG "-DBOBCATE " \ + "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ + "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ + "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ + "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ + "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" +#define LIBNAME "bobcate" +#define CORENAME "BOBCATE" +#endif + #ifdef FORCE_SSE_GENERIC #define FORCE #define FORCE_INTEL diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index e841bb171..4f438d5af 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -794,6 +794,22 @@ static void init_parameter(void) { #endif #endif +#ifdef BOBCATE + +#ifdef DEBUG + fprintf(stderr, "Bobcate\n"); +#endif + + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; +#endif +#endif + #ifdef NANO #ifdef DEBUG diff --git a/kernel/x86/KERNEL.BOBCATE b/kernel/x86/KERNEL.BOBCATE new file mode 100644 index 000000000..231350a62 --- /dev/null +++ b/kernel/x86/KERNEL.BOBCATE @@ -0,0 +1,59 @@ +SGEMMKERNEL = gemm_kernel_4x4_barcelona.S +SGEMMINCOPY = +SGEMMITCOPY = +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = +SGEMMITCOPYOBJ = +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_2x4_barcelona.S +DGEMMINCOPY = ../generic/gemm_ncopy_2.c +DGEMMITCOPY = ../generic/gemm_tcopy_2.c +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +CGEMMINCOPY = +CGEMMITCOPY = +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = +CGEMMITCOPYOBJ = +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_1x2_barcelona.S +ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c +ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) +ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_4x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_4x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_4x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_2x4_sse2.S +DTRSMKERNEL_LT = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RN = trsm_kernel_LT_2x4_sse2.S +DTRSMKERNEL_RT = trsm_kernel_RT_2x4_sse2.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_1x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_1x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S 
+ZGEMM3MKERNEL = zgemm3m_kernel_2x4_barcelona.S diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index 9a7a466a6..f16dda05f 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 147ed19bd..455096a63 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 
* SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index e4f59819b..0222caccb 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 8d6189865..4c38714da 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || 
defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index 6c2682a10..94a479474 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 0d2fcb6d2..95e3d469b 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 
(PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index f77a06d6c..f75f0ae08 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index 84d40ddec..be5aa54b9 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index bce0b0252..e0f37c3e2 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BOBCATE b/kernel/x86_64/KERNEL.BOBCATE new file mode 100644 index 000000000..051a52286 --- /dev/null +++ b/kernel/x86_64/KERNEL.BOBCATE @@ -0,0 +1,62 @@ +ZGEMVNKERNEL = zgemv_n_dup.S +ZGEMVTKERNEL = zgemv_t_dup.S + +SGEMMKERNEL = gemm_kernel_8x4_barcelona.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = gemm_ncopy_4_opteron.S +SGEMMOTCOPY = gemm_tcopy_4_opteron.S +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +DGEMMKERNEL = gemm_kernel_4x4_barcelona.S +DGEMMINCOPY = +DGEMMITCOPY = +DGEMMONCOPY = gemm_ncopy_4_opteron.S +DGEMMOTCOPY = gemm_tcopy_4_opteron.S +DGEMMINCOPYOBJ = +DGEMMITCOPYOBJ = +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +CGEMMKERNEL = zgemm_kernel_4x2_barcelona.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = zgemm_ncopy_2.S +CGEMMOTCOPY = zgemm_tcopy_2.S +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +ZGEMMKERNEL = zgemm_kernel_2x2_barcelona.S +ZGEMMINCOPY = +ZGEMMITCOPY = +ZGEMMONCOPY = zgemm_ncopy_2.S +ZGEMMOTCOPY = zgemm_tcopy_2.S +ZGEMMINCOPYOBJ = +ZGEMMITCOPYOBJ = +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) + +STRSMKERNEL_LN = trsm_kernel_LN_8x4_sse.S +STRSMKERNEL_LT = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RN = trsm_kernel_LT_8x4_sse.S +STRSMKERNEL_RT = trsm_kernel_RT_8x4_sse.S + +DTRSMKERNEL_LN = trsm_kernel_LN_4x4_barcelona.S +DTRSMKERNEL_LT = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RN = trsm_kernel_LT_4x4_barcelona.S +DTRSMKERNEL_RT = trsm_kernel_RT_4x4_barcelona.S + +CTRSMKERNEL_LN = ztrsm_kernel_LN_4x2_sse.S +CTRSMKERNEL_LT = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RN = ztrsm_kernel_LT_4x2_sse.S +CTRSMKERNEL_RT = ztrsm_kernel_RT_4x2_sse.S + +ZTRSMKERNEL_LN = ztrsm_kernel_LN_2x2_sse2.S +ZTRSMKERNEL_LT = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RN = ztrsm_kernel_LT_2x2_sse2.S +ZTRSMKERNEL_RT = ztrsm_kernel_RT_2x2_sse2.S + +CGEMM3MKERNEL = zgemm3m_kernel_8x4_barcelona.S +ZGEMM3MKERNEL = zgemm3m_kernel_4x4_barcelona.S diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index 9db45a642..af7afafcc 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if 
defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index ca03f86b7..a01d4def6 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 01ad2d96e..958f26df8 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 60c1ea778..580f6d1f8 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index fc54dc4a5..aa46ba68b 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index eae31b955..14d696024 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index 4d6ad3326..ded298a98 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index 2623bfe6d..fb20a1a2a 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/l1param.h b/l1param.h index 61c61aa94..aef675633 100644 --- a/l1param.h +++ b/l1param.h @@ -67,6 +67,13 @@ #define ALIGNED_ACCESS #endif +#ifdef BOBCATE +#define PREFETCH prefetch +#define PREFETCHW prefetchw +#define PREFETCHSIZE (128 * 5) +#define ALIGNED_ACCESS +#endif + #ifdef NANO #define PREFETCH prefetcht0 #define PREFETCHW prefetcht0 diff --git a/l2param.h b/l2param.h 
index a371b2ded..a2b632e97 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/param.h b/param.h index 53159a4fd..f0e49cc8b 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -208,6 +208,68 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif +#define SGEMM_DEFAULT_R sgemm_r +#define QGEMM_DEFAULT_R qgemm_r +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R cgemm_r +#define ZGEMM_DEFAULT_R zgemm_r +#define XGEMM_DEFAULT_R xgemm_r + +#define SYMV_P 16 +#define HAVE_EXCLUSIVE_CACHE + +#define GEMM_THREAD gemm_thread_mn + +#endif + +#if defined(BOBCATE) + +#define SNUMOPT 8 +#define DNUMOPT 4 + +#define GEMM_DEFAULT_OFFSET_A 64 +#define GEMM_DEFAULT_OFFSET_B 832 +#define GEMM_DEFAULT_ALIGN 0x0fffUL + +#define SGEMM_DEFAULT_UNROLL_N 4 +#define DGEMM_DEFAULT_UNROLL_N 4 +#define QGEMM_DEFAULT_UNROLL_N 2 +#define CGEMM_DEFAULT_UNROLL_N 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#define XGEMM_DEFAULT_UNROLL_N 1 + +#ifdef ARCH_X86 +#define SGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_M 2 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_M 1 +#define XGEMM_DEFAULT_UNROLL_M 1 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 4 +#define QGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define XGEMM_DEFAULT_UNROLL_M 1 +#endif + + +#define SGEMM_DEFAULT_P 448 +#define DGEMM_DEFAULT_P 224 +#define QGEMM_DEFAULT_P 112 +#define CGEMM_DEFAULT_P 224 +#define ZGEMM_DEFAULT_P 112 +#define XGEMM_DEFAULT_P 56 + +#define SGEMM_DEFAULT_Q 224 +#define DGEMM_DEFAULT_Q 224 +#define QGEMM_DEFAULT_Q 224 +#define CGEMM_DEFAULT_Q 224 +#define ZGEMM_DEFAULT_Q 224 +#define XGEMM_DEFAULT_Q 224 + + #define SGEMM_DEFAULT_R sgemm_r #define QGEMM_DEFAULT_R qgemm_r #define DGEMM_DEFAULT_R dgemm_r From d3b67d0bd85f7036954ebcda6d2d7dcc20c5da19 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Thu, 31 May 2012 22:40:15 +0800 Subject: [PATCH 26/46] Refs #113. 
Fixed the typo BOBCATE -> BOBCAT --- TargetList.txt | 2 +- cpuid.h | 4 ++-- cpuid_x86.c | 12 +++++----- driver/others/parameter.c | 4 ++-- getarch.c | 12 +++++----- kernel/setparam-ref.c | 2 +- kernel/x86/{KERNEL.BOBCATE => KERNEL.BOBCAT} | 0 kernel/x86/trsm_kernel_LN_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_LN_4x4_sse.S | 22 +++++++++---------- kernel/x86/trsm_kernel_LT_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_LT_4x4_sse.S | 22 +++++++++---------- kernel/x86/trsm_kernel_RT_2x4_sse2.S | 10 ++++----- kernel/x86/trsm_kernel_RT_4x4_sse.S | 22 +++++++++---------- kernel/x86/ztrsm_kernel_LN_2x2_sse.S | 4 ++-- kernel/x86/ztrsm_kernel_LT_2x2_sse.S | 4 ++-- kernel/x86/ztrsm_kernel_RT_2x2_sse.S | 4 ++-- .../x86_64/{KERNEL.BOBCATE => KERNEL.BOBCAT} | 0 kernel/x86_64/symv_L_sse.S | 2 +- kernel/x86_64/symv_L_sse2.S | 2 +- kernel/x86_64/symv_U_sse.S | 2 +- kernel/x86_64/symv_U_sse2.S | 2 +- kernel/x86_64/zsymv_L_sse.S | 2 +- kernel/x86_64/zsymv_L_sse2.S | 2 +- kernel/x86_64/zsymv_U_sse.S | 2 +- kernel/x86_64/zsymv_U_sse2.S | 2 +- l1param.h | 2 +- l2param.h | 2 +- param.h | 2 +- 28 files changed, 83 insertions(+), 83 deletions(-) rename kernel/x86/{KERNEL.BOBCATE => KERNEL.BOBCAT} (100%) rename kernel/x86_64/{KERNEL.BOBCATE => KERNEL.BOBCAT} (100%) diff --git a/TargetList.txt b/TargetList.txt index 19008b862..1a212e6ca 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -28,7 +28,7 @@ OPTERON_SSE3 BARCELONA SHANGHAI ISTANBUL -BOBCATE +BOBCAT c)VIA CPU: SSE_GENERIC diff --git a/cpuid.h b/cpuid.h index 1678d0a7e..fdcfcea00 100644 --- a/cpuid.h +++ b/cpuid.h @@ -104,7 +104,7 @@ #define CORE_ATOM 18 #define CORE_NANO 19 #define CORE_SANDYBRIDGE 20 -#define CORE_BOBCATE 21 +#define CORE_BOBCAT 21 #define HAVE_SSE (1 << 0) #define HAVE_SSE2 (1 << 1) @@ -192,5 +192,5 @@ typedef struct { #define CPUTYPE_VIAC3 42 #define CPUTYPE_NANO 43 #define CPUTYPE_SANDYBRIDGE 44 -#define CPUTYPE_BOBCATE 45 +#define CPUTYPE_BOBCAT 45 #endif diff --git a/cpuid_x86.c b/cpuid_x86.c index d31146a98..204f41d51 100644 --- a/cpuid_x86.c +++ b/cpuid_x86.c @@ -1029,7 +1029,7 @@ int get_cpuname(void){ case 10: return CPUTYPE_BARCELONA; case 5: - return CPUTYPE_BOBCATE; + return CPUTYPE_BOBCAT; } break; } @@ -1150,7 +1150,7 @@ static char *cpuname[] = { "VIAC3", "NANO", "SANDYBRIDGE", - "BOBCATE", + "BOBCAT", }; static char *lowercpuname[] = { @@ -1198,7 +1198,7 @@ static char *lowercpuname[] = { "nsgeode", "nano", "sandybridge", - "bobcate", + "bobcat", }; static char *corename[] = { @@ -1223,7 +1223,7 @@ static char *corename[] = { "ATOM", "NANO", "SANDYBRIDGE", - "BOBCATE", + "BOBCAT", }; static char *corename_lower[] = { @@ -1248,7 +1248,7 @@ static char *corename_lower[] = { "atom", "nano", "sandybridge", - "bobcate", + "bobcat", }; @@ -1358,7 +1358,7 @@ int get_coretype(void){ if (family <= 0xe) return CORE_ATHLON; if (family == 0xf){ if ((exfamily == 0) || (exfamily == 2)) return CORE_OPTERON; - else if (exfamily == 5) return CORE_BOBCATE; + else if (exfamily == 5) return CORE_BOBCAT; else return CORE_BARCELONA; } } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index ab90b89f0..d261e5a4e 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -163,7 +163,7 @@ int get_L2_size(void){ int eax, ebx, ecx, edx; -#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) || \ +#if defined(ATHLON) || defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) || \ defined(CORE_PRESCOTT) || defined(CORE_CORE2) || defined(PENRYN) || defined(DUNNINGTON) || \ 
defined(CORE_NEHALEM) || defined(CORE_SANDYBRIDGE) || defined(ATOM) || defined(GENERIC) @@ -446,7 +446,7 @@ void blas_set_parameter(void){ #endif #endif -#if defined(CORE_BARCELONA) || defined(CORE_BOBCATE) +#if defined(CORE_BARCELONA) || defined(CORE_BOBCAT) size >>= 8; sgemm_p = 232 * size; diff --git a/getarch.c b/getarch.c index a8c311035..7e08e774e 100644 --- a/getarch.c +++ b/getarch.c @@ -102,7 +102,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_BARCELONA */ /* #define FORCE_SHANGHAI */ /* #define FORCE_ISTANBUL */ -/* #define FORCE_BOBCATE */ +/* #define FORCE_BOBCAT */ /* #define FORCE_SSE_GENERIC */ /* #define FORCE_VIAC3 */ /* #define FORCE_NANO */ @@ -364,19 +364,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CORENAME "BARCELONA" #endif -#if defined(FORCE_BOBCATE) +#if defined(FORCE_BOBCAT) #define FORCE #define FORCE_INTEL #define ARCHITECTURE "X86" -#define SUBARCHITECTURE "BOBCATE" -#define ARCHCONFIG "-DBOBCATE " \ +#define SUBARCHITECTURE "BOBCAT" +#define ARCHCONFIG "-DBOBCAT " \ "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \ "-DL2_SIZE=524288 -DL2_LINESIZE=64 " \ "-DDTB_DEFAULT_ENTRIES=40 -DDTB_SIZE=4096 " \ "-DHAVE_MMX -DHAVE_SSE -DHAVE_SSE2 -DHAVE_SSE3 -DHAVE_SSSE3 " \ "-DHAVE_SSE4A -DHAVE_MISALIGNSSE -DHAVE_CFLUSH -DHAVE_CMOV" -#define LIBNAME "bobcate" -#define CORENAME "BOBCATE" +#define LIBNAME "bobcat" +#define CORENAME "BOBCAT" #endif #ifdef FORCE_SSE_GENERIC diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index 4f438d5af..f57b425e6 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -794,7 +794,7 @@ static void init_parameter(void) { #endif #endif -#ifdef BOBCATE +#ifdef BOBCAT #ifdef DEBUG fprintf(stderr, "Bobcate\n"); diff --git a/kernel/x86/KERNEL.BOBCATE b/kernel/x86/KERNEL.BOBCAT similarity index 100% rename from kernel/x86/KERNEL.BOBCATE rename to kernel/x86/KERNEL.BOBCAT diff --git a/kernel/x86/trsm_kernel_LN_2x4_sse2.S b/kernel/x86/trsm_kernel_LN_2x4_sse2.S index f16dda05f..2b6877a31 100644 --- a/kernel/x86/trsm_kernel_LN_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LN_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -439,7 +439,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -488,7 +488,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1697,7 +1697,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1727,7 +1727,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LN_4x4_sse.S 
b/kernel/x86/trsm_kernel_LN_4x4_sse.S index 455096a63..82bb1d3ec 100644 --- a/kernel/x86/trsm_kernel_LN_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LN_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -437,7 +437,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -833,7 +833,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1848,7 +1848,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2109,7 +2109,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2429,7 +2429,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -2459,7 +2459,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2952,7 +2952,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -3148,7 +3148,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3389,7 +3389,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -3404,7 +3404,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 diff --git a/kernel/x86/trsm_kernel_LT_2x4_sse2.S b/kernel/x86/trsm_kernel_LT_2x4_sse2.S index 0222caccb..d81177b7e 100644 --- a/kernel/x86/trsm_kernel_LT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_LT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -910,7 +910,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || 
defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -959,7 +959,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 @@ -1439,7 +1439,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1469,7 +1469,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_LT_4x4_sse.S b/kernel/x86/trsm_kernel_LT_4x4_sse.S index 4c38714da..854c44e7a 100644 --- a/kernel/x86/trsm_kernel_LT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_LT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -872,7 +872,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -1316,7 +1316,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 @@ -1855,7 +1855,7 @@ .L52: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1885,7 +1885,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -2249,7 +2249,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2562,7 +2562,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2957,7 +2957,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -2972,7 +2972,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -3280,7 +3280,7 @@ .L102: mulps %xmm0, %xmm2 
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -3515,7 +3515,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 diff --git a/kernel/x86/trsm_kernel_RT_2x4_sse2.S b/kernel/x86/trsm_kernel_RT_2x4_sse2.S index 94a479474..f7a08c699 100644 --- a/kernel/x86/trsm_kernel_RT_2x4_sse2.S +++ b/kernel/x86/trsm_kernel_RT_2x4_sse2.S @@ -69,7 +69,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHSIZE (8 * 10 + 4) #endif @@ -1036,7 +1036,7 @@ .L42: mulpd %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulpd 2 * SIZE(BB), %xmm0 @@ -1066,7 +1066,7 @@ addpd %xmm0, %xmm7 movapd 16 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 8) * SIZE(AA) #endif mulpd %xmm1, %xmm2 @@ -2224,7 +2224,7 @@ .L22: mulsd %xmm0, %xmm2 addsd %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 0) * SIZE(AA) #endif movlpd 2 * SIZE(BB), %xmm2 @@ -2273,7 +2273,7 @@ movlpd 40 * SIZE(BB), %xmm3 addsd %xmm0, %xmm7 movlpd 8 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) PREFETCH (PREFETCHSIZE + 8) * SIZE(AA) #endif mulsd %xmm1, %xmm2 diff --git a/kernel/x86/trsm_kernel_RT_4x4_sse.S b/kernel/x86/trsm_kernel_RT_4x4_sse.S index 95e3d469b..80dc2451c 100644 --- a/kernel/x86/trsm_kernel_RT_4x4_sse.S +++ b/kernel/x86/trsm_kernel_RT_4x4_sse.S @@ -64,7 +64,7 @@ #define BORIG 60(%esp) #define BUFFER 128(%esp) -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 10 + 8) @@ -439,7 +439,7 @@ .L92: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(AA), %xmm0 @@ -454,7 +454,7 @@ mulps 12 * SIZE(BB), %xmm0 addps %xmm0, %xmm7 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm3 @@ -758,7 +758,7 @@ .L102: mulps %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movsd 2 * SIZE(AA), %xmm0 @@ -993,7 +993,7 @@ .L112: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 1 * SIZE(AA), %xmm0 @@ -1324,7 +1324,7 @@ .L52: mulps %xmm0, %xmm2 
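
# Aside, not part of the patch: the OPTERON/BARCELONA/BOBCAT guards renamed
# throughout these trsm kernels gate AMD-tuned software prefetch of the
# packed A panel, issued PREFETCHSIZE elements ahead of the current read
# position. Adding BOBCAT to each guard only opts the new core into the
# prefetch scheme its AMD predecessors already used; no new code path is
# introduced.
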
-#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulps 4 * SIZE(BB), %xmm0 @@ -1354,7 +1354,7 @@ addps %xmm0, %xmm5 movaps 32 * SIZE(AA), %xmm0 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 16) * SIZE(AA) #endif mulps %xmm1, %xmm2 @@ -1718,7 +1718,7 @@ ALIGN_4 .L62: -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif @@ -2031,7 +2031,7 @@ .L72: mulss %xmm0, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif mulss 4 * SIZE(BB), %xmm0 @@ -2859,7 +2859,7 @@ .L22: mulps %xmm0, %xmm2 addps %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movaps 4 * SIZE(BB), %xmm2 @@ -3303,7 +3303,7 @@ .L32: mulss %xmm0, %xmm2 addss %xmm2, %xmm4 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht0 (PREFETCHSIZE + 0) * SIZE(AA) #endif movss 4 * SIZE(BB), %xmm2 diff --git a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S index f75f0ae08..ee9eb9d25 100644 --- a/kernel/x86/ztrsm_kernel_LN_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LN_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -533,7 +533,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S index be5aa54b9..9ef572470 100644 --- a/kernel/x86/ztrsm_kernel_LT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_LT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -994,7 +994,7 @@ addps %xmm0, %xmm7 movsd 16 * SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S index e0f37c3e2..cd1bf2f53 100644 --- a/kernel/x86/ztrsm_kernel_RT_2x2_sse.S +++ b/kernel/x86/ztrsm_kernel_RT_2x2_sse.S @@ -75,7 +75,7 @@ #define STACK_ALIGN 4096 #define STACK_OFFSET 1024 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) #define PREFETCHSIZE (16 * 10 + 8) #define WPREFETCHSIZE 112 #define PREFETCH prefetch @@ -1820,7 +1820,7 @@ addps %xmm0, %xmm7 movsd 16 * 
SIZE(AA), %xmm0 mulps %xmm1, %xmm2 -#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCATE) +#if defined(OPTERON) || defined(BARCELONA) || defined(BOBCAT) prefetcht1 (PREFETCHSIZE + 16) * SIZE(AA) #endif addps %xmm2, %xmm4 diff --git a/kernel/x86_64/KERNEL.BOBCATE b/kernel/x86_64/KERNEL.BOBCAT similarity index 100% rename from kernel/x86_64/KERNEL.BOBCATE rename to kernel/x86_64/KERNEL.BOBCAT diff --git a/kernel/x86_64/symv_L_sse.S b/kernel/x86_64/symv_L_sse.S index af7afafcc..5a123d7f6 100644 --- a/kernel/x86_64/symv_L_sse.S +++ b/kernel/x86_64/symv_L_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_L_sse2.S b/kernel/x86_64/symv_L_sse2.S index a01d4def6..8afdc87db 100644 --- a/kernel/x86_64/symv_L_sse2.S +++ b/kernel/x86_64/symv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse.S b/kernel/x86_64/symv_U_sse.S index 958f26df8..5aef6b461 100644 --- a/kernel/x86_64/symv_U_sse.S +++ b/kernel/x86_64/symv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/symv_U_sse2.S b/kernel/x86_64/symv_U_sse2.S index 580f6d1f8..fa1bfba85 100644 --- a/kernel/x86_64/symv_U_sse2.S +++ b/kernel/x86_64/symv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_L_sse.S b/kernel/x86_64/zsymv_L_sse.S index aa46ba68b..6af65a4ba 100644 --- a/kernel/x86_64/zsymv_L_sse.S +++ b/kernel/x86_64/zsymv_L_sse.S @@ -160,7 +160,7 @@ #define a3 %xmm14 #define xt1 %xmm15 -#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if (defined(HAVE_SSE3) && !defined(CORE_OPTERON)) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define MOVDDUP(a, b, c) movddup a(b), c #define MOVDDUP2(a, b, c) movddup a##b, c #else diff --git a/kernel/x86_64/zsymv_L_sse2.S b/kernel/x86_64/zsymv_L_sse2.S index 14d696024..71aca0198 100644 --- a/kernel/x86_64/zsymv_L_sse2.S +++ b/kernel/x86_64/zsymv_L_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/kernel/x86_64/zsymv_U_sse.S b/kernel/x86_64/zsymv_U_sse.S index ded298a98..4b8422d82 100644 --- a/kernel/x86_64/zsymv_U_sse.S +++ b/kernel/x86_64/zsymv_U_sse.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff 
--git a/kernel/x86_64/zsymv_U_sse2.S b/kernel/x86_64/zsymv_U_sse2.S index fb20a1a2a..33667f79e 100644 --- a/kernel/x86_64/zsymv_U_sse2.S +++ b/kernel/x86_64/zsymv_U_sse2.S @@ -76,7 +76,7 @@ #define movsd movlpd #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (16 * 16) diff --git a/l1param.h b/l1param.h index aef675633..6fe756f17 100644 --- a/l1param.h +++ b/l1param.h @@ -67,7 +67,7 @@ #define ALIGNED_ACCESS #endif -#ifdef BOBCATE +#ifdef BOBCAT #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHSIZE (128 * 5) diff --git a/l2param.h b/l2param.h index a2b632e97..cdbd8805e 100644 --- a/l2param.h +++ b/l2param.h @@ -85,7 +85,7 @@ #define movsd movlps #endif -#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCATE) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define ALIGNED_ACCESS #define MOVUPS_A movaps #define MOVUPS_XL movaps diff --git a/param.h b/param.h index f0e49cc8b..3add52615 100644 --- a/param.h +++ b/param.h @@ -222,7 +222,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BOBCATE) +#if defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From eefd30881c3b1f46b3f9490815b1cd3286e63e4d Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Sat, 2 Jun 2012 21:34:23 +0800 Subject: [PATCH 27/46] Refs #113. Fixed the build bug on AMD Bobcat 64-bit OS. --- kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S | 2 +- kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S index fb428cbf5..b8caa9a44 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LN_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S index e9edc29ac..2db8cbc5d 100644 --- a/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LN_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S index dabc97c3e..16c9ca828 100644 --- a/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_LT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S index 7375c3487..dbdbfe2e1 
100644 --- a/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_LT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S index 3ab9e5be8..181cdd29c 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S +++ b/kernel/x86_64/ztrsm_kernel_RT_2x2_sse2.S @@ -86,7 +86,7 @@ #define BORIG 72(%rsp) #define BUFFER 128(%rsp) -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta diff --git a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S index 85c0ac231..c28d02927 100644 --- a/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S +++ b/kernel/x86_64/ztrsm_kernel_RT_4x2_sse.S @@ -95,7 +95,7 @@ #define PREFETCHSIZE (8 * 6 + 4) #endif -#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) +#if defined(OPTERON) || defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define PREFETCH prefetch #define PREFETCHW prefetchw #define PREFETCHNTA prefetchnta From f76f9525477785bb452699c07d1985ec14dc2b61 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 16:17:43 +0800 Subject: [PATCH 28/46] Refs #83 #53. Adding Intel Sandy Bridge (AVX supported) kernel codes for BLAS level 3 functions. --- kernel/generic/zgemm_ncopy_4_sandy.c | 235 ++ kernel/generic/zgemm_ncopy_8_sandy.c | 401 +++ kernel/generic/zgemm_tcopy_4_sandy.c | 237 ++ kernel/generic/zgemm_tcopy_8_sandy.c | 370 ++ kernel/x86_64/KERNEL.SANDYBRIDGE | 97 +- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 4478 ++++++++++++++++++++++++ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 3186 +++++++++++++++++ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 3736 ++++++++++++++++++++ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 3257 +++++++++++++++++ param.h | 100 +- 10 files changed, 15982 insertions(+), 115 deletions(-) create mode 100644 kernel/generic/zgemm_ncopy_4_sandy.c create mode 100644 kernel/generic/zgemm_ncopy_8_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_4_sandy.c create mode 100644 kernel/generic/zgemm_tcopy_8_sandy.c create mode 100644 kernel/x86_64/cgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/dgemm_kernel_4x8_sandy.S create mode 100644 kernel/x86_64/sgemm_kernel_8x8_sandy.S create mode 100644 kernel/x86_64/zgemm_kernel_4x4_sandy.S diff --git a/kernel/generic/zgemm_ncopy_4_sandy.c b/kernel/generic/zgemm_ncopy_4_sandy.c new file mode 100644 index 000000000..839bd5939 --- /dev/null +++ b/kernel/generic/zgemm_ncopy_4_sandy.c @@ -0,0 +1,235 @@ +/***************************************************************************** + Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are +met: + +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. + +2. 
Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the ISCAS nor the names of its contributors may +be used to endorse or promote products derived from this software +without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + **********************************************************************************/ + +#include <stdio.h> +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*src4,*src5,*src6,*src7,*dest0; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j +#include "common.h" + +int CNAME(BLASLONG row,BLASLONG col,FLOAT* src,BLASLONG srcdim,FLOAT* dest) +{ + BLASLONG i,j; + BLASLONG idx=0; + BLASLONG ii; + FLOAT *src0,*src1,*src2,*src3,*dest0; + FLOAT *dest1,*dest2,*dest4; + ii = col&-8; + ii = ii*(2*row); + dest4 = dest+ii; + ii = col&-4; + ii = ii*(2*row); + dest2 = dest+ii; + ii = col&-2; + ii = ii*(2*row); + dest1 = dest+ii; + for (j=0; j Date: Tue, 19 Jun 2012 17:05:16 +0800 Subject: [PATCH 29/46] Refs #113. Fixed BOBCATE typo in dynamic arch building. --- Makefile.system | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile.system b/Makefile.system index 987bb83cf..425cbb68a 100644 --- a/Makefile.system +++ b/Makefile.system @@ -247,11 +247,11 @@ endif ifdef DYNAMIC_ARCH ifeq ($(ARCH), x86) DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \ - CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO + CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifeq ($(ARCH), x86_64) -DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCATE ATOM NANO +DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM SANDYBRIDGE OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO endif ifndef DYNAMIC_CORE From 996dc6d1c89e605a721294685db8549cd21e19b3 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Tue, 19 Jun 2012 17:29:06 +0800 Subject: [PATCH 30/46] Fixed dynamic_arch building bug. 
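
Context for this one-line fix: a DYNAMIC_ARCH build compiles the kernels and the param.h-derived parameter table once for every core named in DYNAMIC_CORE, then selects one table at run time from the detected core type, so a core listed in DYNAMIC_CORE with no matching block in param.h fails to compile; that is presumably what the BOBCAT build hit here. A rough sketch of the run-time selection (table and function names are made up for illustration; the real dispatch lives in driver/others/dynamic.c):

/* Core ids as defined in cpuid.h earlier in this series. */
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT      21

/* One blocking-parameter table per compiled-in core. */
typedef struct { int sgemm_p, sgemm_q, dgemm_p, dgemm_q; } core_table_t;

static core_table_t table_sandybridge, table_bobcat, table_generic;

static core_table_t *select_table(int coretype) {
  switch (coretype) {
  case CORE_SANDYBRIDGE: return &table_sandybridge;
  case CORE_BOBCAT:      return &table_bobcat;   /* needs its param.h block */
  default:               return &table_generic;  /* conservative fallback   */
  }
}
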
--- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index db9ec6a3b..27aeeb6ac 100644 --- a/param.h +++ b/param.h @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -143,7 +143,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#if defined(BARCELONA) || defined(SHANGHAI) +#if defined(BARCELONA) || defined(SHANGHAI) || defined(BOBCAT) #define SNUMOPT 8 #define DNUMOPT 4 From 3ef96aa567e27ab76f07701b37da1ca0c0c59f39 Mon Sep 17 00:00:00 2001 From: wangqian Date: Tue, 19 Jun 2012 20:37:22 +0800 Subject: [PATCH 31/46] Fixed bug in MOVQ redefine and ALIGN SIZE problem. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 137 +++++++++--------- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 163 +++++++++++----------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 185 +++++++++++++------------ kernel/x86_64/zgemm_kernel_4x4_sandy.S | 118 ++++++++-------- 4 files changed, 304 insertions(+), 299 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 2b4e4dc64..56ebee120 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -146,6 +146,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -305,7 +306,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -320,7 +321,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -367,7 +368,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -591,7 +592,7 @@ ADD2_SY yvec7, yvec8, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -599,7 +600,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_loopB: ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -717,7 +718,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; ######### Unroll 1 ################## PREFETCH0 PRESIZE*SIZE(ptrba) @@ -875,7 +876,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; EXTRA_SY $1,yvec13,xvec5; @@ -934,7 +935,7 @@ ADDQ $16*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -1077,11 +1078,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1113,7 +1114,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_SY 
0*SIZE(ptrba), yvec0; @@ -1242,7 +1243,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L8_bodyB; -.align 32 +ALIGN_5 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1250,7 +1251,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1323,7 +1324,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: #### Unroll times 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -1494,7 +1495,7 @@ ADDQ $8*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1527,7 +1528,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1652,7 +1653,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 32 +ALIGN_5 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1660,7 +1661,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1731,7 +1732,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 @@ -1875,7 +1876,7 @@ ADDQ $4*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1905,7 +1906,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1978,7 +1979,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1986,7 +1987,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2031,7 +2032,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2129,11 +2130,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2145,7 +2146,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2181,7 +2182,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2430,7 +2431,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2438,7 +2439,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -2571,7 +2572,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 
0*SIZE(ptrbb), xvec5; @@ -2825,11 +2826,11 @@ ADDQ $16*SIZE, C0; ADDQ $16*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2862,7 +2863,7 @@ MOVQ %rax, kkk; SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3002,7 +3003,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3010,7 +3011,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3089,7 +3090,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3237,7 +3238,7 @@ ADDQ $8*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3267,7 +3268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3351,7 +3352,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3359,7 +3360,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3409,7 +3410,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; @@ -3503,7 +3504,7 @@ ADDQ $4*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 32 +ALIGN_5 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3532,7 +3533,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3585,7 +3586,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3616,7 +3617,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -3684,7 +3685,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -3695,7 +3696,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: MOVQ bb, ptrbb; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) @@ -3727,7 +3728,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3800,7 +3801,7 @@ ADDQ $64*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3808,7 +3809,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: #### Unroll 1 #### 
LD_SY 0*SIZE(ptrba), yvec0; @@ -3853,7 +3854,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -3941,11 +3942,11 @@ ADDQ $8, kk; ADDQ $16*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3974,7 +3975,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4023,7 +4024,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4031,7 +4032,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4064,7 +4065,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: #### Unroll 1 #### LD_SY 0*SIZE(ptrba), yvec0; @@ -4128,7 +4129,7 @@ ADDQ $8*SIZE, C0; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4157,7 +4158,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4202,7 +4203,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4210,7 +4211,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4241,7 +4242,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; @@ -4300,7 +4301,7 @@ ADDQ $4*SIZE, C0; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -4329,7 +4330,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4354,7 +4355,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -4362,7 +4363,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -4383,7 +4384,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index fea5ecb4a..c98879d7c 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -140,6 +140,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JNE jne #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -265,7 +266,7 @@ movq %r11, kk MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -281,7 +282,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; # Rm = 8 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -328,7 +329,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; # Computing kernel @@ -448,7 +449,7 @@ ADD_DY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; PREFETCH2 0*SIZE(prebb); ADDQ $8*SIZE, prebb; @@ -459,7 +460,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -529,7 +530,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_bodyB:; #### Unroll times 1 #### PREFETCH0 64*SIZE(ptrba) @@ -588,7 +589,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -648,7 +649,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32; +ALIGN_5; .L4_loopEx:; EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL @@ -776,11 +777,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32 +ALIGN_5 .L1_loopE:; TEST $4, bm; # Rm = 4 JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -816,7 +817,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L6_loopE; -.align 32; +ALIGN_5; .L6_bodyB:; # Computing kernel @@ -887,7 +888,7 @@ MUL_DY yvec1, yvec5, yvec7; ADD_DY yvec9, yvec7, yvec9; DECQ k; JG .L6_bodyB; -.align 32 +ALIGN_5 .L6_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -896,7 +897,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB:; #### Untoll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -940,7 +941,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB:; #### Untoll time 1 #### MUL_DY yvec0, yvec2, yvec6; @@ -977,7 +978,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L8_loopEx; # Unalign part write back -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1014,7 +1015,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L8_loopEx:; EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec13,xvec5; @@ -1080,7 +1081,7 @@ ADDQ $4*SIZE, C1; .L5_loopE:; TEST $2, bm; JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1117,7 +1118,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32; +ALIGN_5; .L10_bodyB:; # Computing kernel @@ -1192,7 +1193,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec9; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE:; #ifndef TRMMKERNEL TEST $2, bk @@ -1201,7 +1202,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB:; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; @@ -1248,7 +1249,7 @@ MOVQ kkk, %rax; TEST $1, 
%rax; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; MUL_DX xvec0, xvec2; @@ -1285,7 +1286,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L12_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec13; @@ -1310,7 +1311,7 @@ ADDQ $2, kk ADDQ $2*SIZE, C0 ADDQ $2*SIZE, C1 JMP .L9_loopE; -.align 32 +ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec14; @@ -1349,7 +1350,7 @@ ADDQ $2*SIZE, C1; .L9_loopE:; TEST $1, bm JLE .L13_loopE; -.align 32 +ALIGN_5 .L13_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1379,7 +1380,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 32 +ALIGN_5 .L14_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1404,7 +1405,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 32 +ALIGN_5 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1413,7 +1414,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L15_loopE; -.align 32 +ALIGN_5 .L15_bodyB: BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1434,7 +1435,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L16_loopE; -.align 32 +ALIGN_5 .L16_bodyB:; BROAD_DY 0*SIZE(ptrba), yvec0; LD_DY 0*SIZE(ptrbb), yvec2; @@ -1485,11 +1486,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32; +ALIGN_5; .L20_loopB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1501,7 +1502,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; # Rm = 8 JLE .L21_loopE; -.align 32; +ALIGN_5; .L21_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1538,7 +1539,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32; +ALIGN_5; .L211_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1692,7 +1693,7 @@ MUL_DX xvec3, xvec7; ADD_DX xvec7, xvec8; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1701,7 +1702,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L212_loopE; -.align 32; +ALIGN_5; .L212_bodyB: # Computing kernel #### Unroll time 1 #### @@ -1788,7 +1789,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -1858,7 +1859,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -1893,7 +1894,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -1956,7 +1957,7 @@ JG .L21_bodyB; .L21_loopE:; TEST $4, bm; # Rm = 4 JLE .L22_loopE; -.align 32; +ALIGN_5; .L22_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1989,7 +1990,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB:; # Computing kernel #### Unroll time 1 #### @@ -2071,7 +2072,7 @@ MUL_DX xvec1, xvec5; ADD_DX xvec5, xvec10; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -2080,7 +2081,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### LD_DX 
0*SIZE(ptrba), xvec0; @@ -2129,7 +2130,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2171,7 +2172,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L223_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2196,7 +2197,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L22_loopE; -.align 32 +ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2237,7 +2238,7 @@ ADDQ $4*SIZE, C1; .L22_loopE:; TEST $2, bm; # Rm = 2 JLE .L23_loopE; -.align 32; +ALIGN_5; .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2267,7 +2268,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: # Computing kernel #### Unroll time 1 #### @@ -2309,7 +2310,7 @@ ADD_DX xvec5, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2318,7 +2319,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2347,7 +2348,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L233_loopE; -.align 32 +ALIGN_5 .L233_bodyB: #### Unroll time 1 #### LD_DX 0*SIZE(ptrba), xvec0; @@ -2373,7 +2374,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L233_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0), xvec11; @@ -2394,7 +2395,7 @@ ADDQ $2, kk; ADDQ $2*SIZE, C0; ADDQ $2*SIZE, C1; JMP .L23_loopE; -.align 32 +ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2425,7 +2426,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; # Rm = 1 JLE .L24_loopE; -.align 32; +ALIGN_5; .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2454,7 +2455,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 32 +ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2479,7 +2480,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align 32 +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2488,7 +2489,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L242_loopE; -.align 32 +ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2509,7 +2510,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L243_loopE; -.align 32 +ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; @@ -2551,7 +2552,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE:; TEST $1, bn; # Rn = 1 JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -2562,7 +2563,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2593,7 +2594,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2634,7 +2635,7 @@ ADD_DY yvec4, yvec14, yvec14; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2643,7 +2644,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L312_loopE; 
-.align 32 +ALIGN_5 .L312_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2673,7 +2674,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: #### Unroll time 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -2696,7 +2697,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L313_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2724,7 +2725,7 @@ ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; JMP .L31_loopE; -.align 32 +ALIGN_5 .L313_loopEx: EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; @@ -2766,7 +2767,7 @@ JG .L31_bodyB; .L31_loopE: TEST $4, bm JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2796,7 +2797,7 @@ MOVQ %rax, kkk #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2821,7 +2822,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2830,7 +2831,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2852,7 +2853,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec1; @@ -2870,7 +2871,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L323_loopEx; -.align 32 +ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL @@ -2891,7 +2892,7 @@ ADDQ $4, kk #endif ADDQ $4*SIZE, C0; JMP .L32_loopE; -.align 32 +ALIGN_5 .L323_loopEx: #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; @@ -2921,7 +2922,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $2, bm JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -2951,7 +2952,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -2976,7 +2977,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2,bk; @@ -2985,7 +2986,7 @@ MOVQ kkk, %rax; TEST $2, %rax #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3006,7 +3007,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3039,7 +3040,7 @@ ADDQ $2*SIZE, C0; .L33_loopE: TEST $1, bm JLE .L34_loopE; -.align 32 +ALIGN_5 .L34_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3068,7 +3069,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 32 +ALIGN_5 .L341_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ -3093,7 +3094,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L341_bodyB; -.align 32 +ALIGN_5 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3102,7 +3103,7 @@ MOVQ kkk, %rax; TEST $2, %rax; #endif JLE .L342_loopE; -.align 32 +ALIGN_5 .L342_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; @@ 
-3124,7 +3125,7 @@ MOVQ kkk, %rax; TEST $1, %rax; #endif JLE .L343_loopE; -.align 32 +ALIGN_5 .L343_bodyB: movsd 0*SIZE(ptrba), xvec0; movsd 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 44f8f1802..4d16a60d0 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ #define MOVQ movq #define XOR_SY vxorps @@ -273,7 +274,7 @@ movq %r11, kk MOVQ bn,j; SARQ $3,j; JLE .L0_loopE; -.align 16; +ALIGN_4; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -289,7 +290,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $3,i; JLE .L1_loopE; -.align 16; +ALIGN_4; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -342,7 +343,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L2_loopE; -.align 16; +ALIGN_4; .L2_bodyB:; # Computing kernel @@ -472,7 +473,7 @@ ADD_SY yvec8, yvec7, yvec8; .L2_bodyE:; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_4 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -480,7 +481,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_4 .L3_loobB: #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -550,7 +551,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_4 .L4_loopB:; #### Unroll times 1 #### MUL_SY yvec0, yvec2, yvec6; @@ -609,7 +610,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1,yvec15,xvec7; EXTRA_SY $1,yvec14,xvec6; @@ -669,7 +670,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 16; +ALIGN_4; .L4_loopEx: LEAQ (ldc,ldc,2),%rax; EXTRA_SY $1, yvec15, xvec7; @@ -813,11 +814,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 16 +ALIGN_4 .L1_loopE:; TEST $4, bm; JLE .L5_loopE; -.align 16 +ALIGN_4 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -857,7 +858,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L8_loopE; -.align 16 +ALIGN_4 .L8_bodyB: #### Unroll time 1 #### @@ -983,7 +984,7 @@ MUL_SX xvec1, xvec5; ADD_SX xvec5, xvec8; DECQ k; JG .L8_bodyB; -.align 16 +ALIGN_4 .L8_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -991,7 +992,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L9_loopE; -.align 16 +ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1062,7 +1063,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L10_loopE; -.align 16 +ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; @@ -1122,7 +1123,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L10_loopEx; -.align 16 +ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL ADD_SX 0*SIZE(C0), xvec15; @@ -1155,7 +1156,7 @@ ADDQ $4, kk ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 16 +ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL @@ -1215,7 +1216,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $2, bm; JLE .L6_loopE; -.align 16 +ALIGN_4 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1249,7 +1250,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L11_loopE; -.align 16 +ALIGN_4 .L11_bodyB: #### Computing kernel LD_SX 
0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 @@ -1318,7 +1319,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L11_bodyB; -.align 16 +ALIGN_4 .L11_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1326,7 +1327,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L12_loopE; -.align 16 +ALIGN_4 .L12_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1368,7 +1369,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L13_loopE; -.align 16 +ALIGN_4 .L13_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 @@ -1433,7 +1434,7 @@ ADDQ $2*SIZE, C1; .L6_loopE: TEST $1, bm; JLE .L7_loopE; -.align 16 +ALIGN_4 .L7_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1465,7 +1466,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L14_loopE; -.align 16 +ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1503,7 +1504,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L14_bodyB; -.align 16 +ALIGN_4 .L14_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1511,7 +1512,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L15_loopE; -.align 16 +ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1538,7 +1539,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L16_loopE; -.align 16 +ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -1611,11 +1612,11 @@ LEAQ (C,ldc,8),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 16; +ALIGN_4; .L0_loopE:; TEST $4, bn; # Rn = 4 JLE .L20_loopE; -.align 16; +ALIGN_4; .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1628,7 +1629,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L21_loopE; -.align 16 +ALIGN_4 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -1668,7 +1669,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; JLE .L211_loopE; -.align 16 +ALIGN_4 .L211_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1800,7 +1801,7 @@ ADD_SX xvec7, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; -.align 16 +ALIGN_4 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk @@ -1808,7 +1809,7 @@ TEST $2, bk TEST $2, kkk; #endif JLE .L212_loopE; -.align 16 +ALIGN_4 .L212_bodyB: #### Unroll time 1 #### ODUP_SX 0*SIZE(ptrbb), xvec3; @@ -1882,7 +1883,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 16 +ALIGN_4 .L213_bodyB: ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; @@ -1982,11 +1983,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 16 +ALIGN_4 .L21_loopE: TEST $4, bm; JLE .L22_loopE; -.align 16 +ALIGN_4 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2019,7 +2020,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 16 +ALIGN_4 .L221_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2089,7 +2090,7 @@ ADDQ $16*SIZE, ptrbb; DECQ k; JG .L221_bodyB; -.align 16 +ALIGN_4 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2097,7 +2098,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 16 +ALIGN_4 .L222_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2139,7 +2140,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 16 +ALIGN_4 .L223_bodyB: LD_SX 
0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec2; @@ -2203,7 +2204,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $2, bm; JLE .L23_loopE; -.align 16 +ALIGN_4 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2234,7 +2235,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 16 +ALIGN_4 .L231_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2274,7 +2275,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 16 +ALIGN_4 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2282,7 +2283,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 16 +ALIGN_4 .L232_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2310,7 +2311,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 16 +ALIGN_4 .L233_bodyB: LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; @@ -2356,7 +2357,7 @@ ADDQ $2*SIZE, C1; .L23_loopE: TEST $1, bm; JLE .L24_loopE; -.align 16 +ALIGN_4 .L24_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2386,7 +2387,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L241_loopE; -.align 16 +ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2419,7 +2420,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L242_loopE; -.align 16 +ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2440,7 +2441,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L243_loopE; -.align 16; +ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; @@ -2491,7 +2492,7 @@ LEAQ (C, ldc, 4), C; .L20_loopE: TEST $2, bn; JLE .L30_loopE; -.align 16 +ALIGN_4 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2503,7 +2504,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L31_loopE; -.align 16 +ALIGN_4 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2536,7 +2537,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 16 +ALIGN_4 .L311_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2612,7 +2613,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 16 +ALIGN_4 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2620,7 +2621,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 16 +ALIGN_4 .L312_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2666,7 +2667,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 16 +ALIGN_4 .L313_bodyB: LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2731,11 +2732,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L31_bodyB; -.align 16 +ALIGN_4 .L31_loopE: TEST $4, bm; JLE .L32_loopE; -.align 16 +ALIGN_4 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2766,7 +2767,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 16 +ALIGN_4 .L321_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2806,7 +2807,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 16 +ALIGN_4 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2814,7 +2815,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 16 +ALIGN_4 
.L322_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2842,7 +2843,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 16 +ALIGN_4 .L323_bodyB: LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; @@ -2887,7 +2888,7 @@ ADDQ $4*SIZE, C1; .L32_loopE: TEST $2, bm; JLE .L33_loopE; -.align 16 +ALIGN_4 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -2920,7 +2921,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 16 +ALIGN_4 .L331_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2943,7 +2944,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 16 +ALIGN_4 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2951,7 +2952,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 16 +ALIGN_4 .L332_bodyB: LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 @@ -2972,7 +2973,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 16 +ALIGN_4 .L333_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3031,7 +3032,7 @@ ADDQ $2*SIZE, C1; .L33_loopE: TEST $1, bm; JLE .L34_loopE; -.align 16 +ALIGN_4 .L34_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3062,7 +3063,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L341_loopE; -.align 16 +ALIGN_4 .L341_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3104,7 +3105,7 @@ addq $4*SIZE, ptrba; addq $8*SIZE, ptrbb; decq k; jg .L341_bodyB; -.align 16 +ALIGN_4 .L341_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3112,7 +3113,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L342_loopE; -.align 16 +ALIGN_4 .L342_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3140,7 +3141,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L343_loopE; -.align 16 +ALIGN_4 .L343_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3189,7 +3190,7 @@ LEAQ (C, ldc, 2), C; .L30_loopE: TEST $1, bn; JLE .L40_loopE; -.align 16 +ALIGN_4 .L40_bodyB: #if defined(TRMMKERNEL)&&defined(LEFT) MOVQ OFFSET, %rax; @@ -3200,7 +3201,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $3, i; JLE .L41_loopE; -.align 16 +ALIGN_4 .L41_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3230,7 +3231,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L411_loopE; -.align 16 +ALIGN_4 .L411_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3256,7 +3257,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L411_bodyB; -.align 16 +ALIGN_4 .L411_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3264,7 +3265,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L412_loopE; -.align 16 +ALIGN_4 .L412_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3285,7 +3286,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L413_loopE; -.align 16 +ALIGN_4 .L413_bodyB: LD_SY 0*SIZE(ptrba), yvec0; BROAD_SY 0*SIZE(ptrbb), yvec1; @@ -3329,11 +3330,11 @@ ADDQ $8, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L41_bodyB; -.align 16 +ALIGN_4 .L41_loopE: TEST $4, bm; JLE .L42_loopE; -.align 16 +ALIGN_4 .L42_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb, ptrbb; @@ -3362,7 +3363,7 @@ MOVQ 
%rax, kkk #endif SARQ $2, k; JLE .L421_loopE; -.align 16 +ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3387,7 +3388,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; JG .L421_bodyB; -.align 16 +ALIGN_4 .L421_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3395,7 +3396,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L422_loopE; -.align 16 +ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3416,7 +3417,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L423_loopE; -.align 16 +ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; @@ -3451,7 +3452,7 @@ ADDQ $4*SIZE, C0; .L42_loopE: TEST $2, bm; JLE .L43_loopE; -.align 16 +ALIGN_4 .L43_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3481,7 +3482,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L431_loopE; -.align 16 +ALIGN_4 .L431_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3518,7 +3519,7 @@ addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L431_bodyB; -.align 16 +ALIGN_4 .L431_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3526,7 +3527,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L432_loopE; -.align 16 +ALIGN_4 .L432_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3553,7 +3554,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L433_loopE; -.align 16 +ALIGN_4 .L433_bodyB: movss 0*SIZE(ptrba), xvec0; movss 1*SIZE(ptrba), xvec1; @@ -3592,7 +3593,7 @@ addq $2*SIZE, C0; .L43_loopE: TEST $1, bm; JLE .L44_loopE; -.align 16 +ALIGN_4 .L44_bodyB: #if !defined(TRMMKERNEL) || (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) || (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bb, ptrbb; @@ -3621,7 +3622,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L441_loopE; -.align 16 +ALIGN_4 .L441_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3646,7 +3647,7 @@ addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; JG .L441_bodyB; -.align 16 +ALIGN_4 .L441_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3654,7 +3655,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L442_loopE; -.align 16 +ALIGN_4 .L442_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; @@ -3675,7 +3676,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L443_loopE; -.align 16 +ALIGN_4 .L443_bodyB: movss 0*SIZE(ptrba), xvec0; movss 0*SIZE(ptrbb), xvec1; diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index 34abbb529..f6f9f707f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -145,6 +145,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define JMP jmp #define NOP #define XOR xorpd +#undef MOVQ +#define MOVQ movq #define XOR_SY vxorps #define XOR_DY vxorpd @@ -297,7 +299,7 @@ movq %r11, kk; MOVQ bn,j; SARQ $2,j; # Rn = 4 JLE .L0_loopE; -.align 32; +ALIGN_5; .L0_bodyB:; #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -312,7 +314,7 @@ MOVQ ba,ptrba; MOVQ bm,i; SARQ $2,i; # Rm = 4 JLE .L1_loopE; -.align 32; +ALIGN_5; .L1_bodyB:; #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -361,7 +363,7 @@ MOVQ %rax, kkk; #endif SARQ $2,k; # Unroll 4 times JLE .L2_loopE; -.align 32; +ALIGN_5; .L2_bodyB:; #### Computing kernel #### @@ -584,7 +586,7 @@ ADD2_DY yvec6, yvec12, yvec12; ADD2_DY yvec7, yvec8, yvec8; DECQ k; JG .L2_bodyB; -.align 64; +ALIGN_5 .L2_loopE:; #ifndef TRMMKERNEL TEST $2, bk; @@ -592,7 +594,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L3_loopE; -.align 64 +ALIGN_5 .L3_bodyB: #### Unroll time 1 #### LD_DY 4*SIZE(ptrba), yvec1; @@ -710,7 +712,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L4_loopE; -.align 64 +ALIGN_5 .L4_loopB:; #### Unroll time 1 #### PREFETCH0 PRESIZE*SIZE(ptrba); @@ -852,7 +854,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L4_loopEx; -.align 32 +ALIGN_5 #### Store Back #### EXTRA_DY $1,yvec15,xvec7; EXTRA_DY $1,yvec14,xvec6; @@ -912,7 +914,7 @@ ADDQ $8*SIZE,C1; DECQ i; JG .L1_bodyB; JMP .L1_loopE; -.align 32 +ALIGN_5 .L4_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1024,11 +1026,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L1_bodyB; -.align 32; +ALIGN_5; .L1_loopE:; TEST $2, bm; JLE .L5_loopE; -.align 32 +ALIGN_5 .L5_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1060,7 +1062,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L7_loopE; -.align 32 +ALIGN_5 .L7_bodyB: #### Compute kernel #### #### Unroll times 1 #### @@ -1194,7 +1196,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L7_bodyB; -.align 32 +ALIGN_5 .L7_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1202,7 +1204,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L8_loopE; -.align 32 +ALIGN_5 .L8_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1276,7 +1278,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L9_loopE; -.align 32 +ALIGN_5 .L9_bodyB: #### Unroll times 1 #### LD_DY 0*SIZE(ptrba), yvec0; @@ -1364,7 +1366,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L9_loopEx; -.align 32 +ALIGN_5 #### Writing back #### EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1401,7 +1403,7 @@ ADDQ $2, kk; ADDQ $4*SIZE, C0; ADDQ $4*SIZE, C1; JMP .L5_loopE; -.align 32 +ALIGN_5 .L9_loopEx: EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; @@ -1466,7 +1468,7 @@ ADDQ $4*SIZE, C1; .L5_loopE: TEST $1, bm; JLE .L6_loopE; -.align 32 +ALIGN_5 .L6_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1496,7 +1498,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L10_loopE; -.align 32 +ALIGN_5 .L10_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1570,7 +1572,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; JG .L10_bodyB; -.align 32 +ALIGN_5 .L10_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1578,7 +1580,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L11_loopE; -.align 32 +ALIGN_5 .L11_bodyB: LD_DY 0*SIZE(ptrba), 
yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1624,7 +1626,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L12_loopE; -.align 32 +ALIGN_5 .L12_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1722,11 +1724,11 @@ LEAQ (C,ldc,4),C; .L0_bodyE:; DECQ j; JG .L0_bodyB; -.align 32; +ALIGN_5; .L0_loopE:; TEST $2, bn; JLE .L20_loopE; -.align 32 +ALIGN_5 .L20_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -1738,7 +1740,7 @@ MOVQ ba, ptrba; MOVQ bm, i; SARQ $2, i; JLE .L21_loopE; -.align 32 +ALIGN_5 .L21_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -1770,7 +1772,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L211_loopE; -.align 32 +ALIGN_5 .L211_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1891,7 +1893,7 @@ ADD2_DY yvec7, yvec12, yvec12; ADDQ $32*SIZE, ptrba; DECQ k; JG .L211_bodyB; -.align 32 +ALIGN_5 .L211_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -1899,7 +1901,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L212_loopE; -.align 32 +ALIGN_5 .L212_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -1969,7 +1971,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L213_loopE; -.align 32 +ALIGN_5 .L213_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2058,7 +2060,7 @@ MOVQ C0, %rax; OR ldc, %rax; TEST $15, %rax; JNE .L213_loopEx; -.align 32 +ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL ADD_DX 0*SIZE(C0),xvec15; @@ -2093,7 +2095,7 @@ ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; JMP .L21_loopE; -.align 32 +ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL LDL_DX 0*SIZE(C0), xvec0; @@ -2153,11 +2155,11 @@ ADDQ $8*SIZE, C0; ADDQ $8*SIZE, C1; DECQ i; JG .L21_bodyB; -.align 32 +ALIGN_5 .L21_loopE: TEST $2, bm; JLE .L22_loopE; -.align 32 +ALIGN_5 .L22_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2187,7 +2189,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L221_loopE; -.align 32 +ALIGN_5 .L221_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2268,7 +2270,7 @@ ADD2_DY yvec6, yvec13, yvec13; ADDQ $16*SIZE, ptrba; DECQ k; JG .L221_bodyB; -.align 32 +ALIGN_5 .L221_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2276,7 +2278,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L222_loopE; -.align 32 +ALIGN_5 .L222_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2325,7 +2327,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L223_loopE; -.align 32 +ALIGN_5 .L223_bodyB: #### Unroll time 1 #### EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2419,7 +2421,7 @@ ADDQ $4*SIZE, C1; .L22_loopE: TEST $1, bm; JLE .L23_loopE; -.align 32 +ALIGN_5 .L23_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2448,7 +2450,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L231_loopE; -.align 32 +ALIGN_5 .L231_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2498,7 +2500,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L231_bodyB; -.align 32 +ALIGN_5 .L231_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2506,7 +2508,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L232_loopE; -.align 32 +ALIGN_5 .L232_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2540,7 +2542,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L233_loopE; -.align 
32 +ALIGN_5 .L233_bodyB: LD_DY 0*SIZE(ptrba), yvec0; #### A1r A1i A2r A2i EDUP_DY 0*SIZE(ptrbb), yvec2; @@ -2614,7 +2616,7 @@ LEAQ (C, ldc, 2), C; .L20_loopE: TEST $1, bn; JLE .L30_loopE; -.align 32 +ALIGN_5 .L30_bodyB: #if defined(TRMMKERNEL) && defined(LEFT) MOVQ OFFSET, %rax; @@ -2625,7 +2627,7 @@ MOVQ C, C0; MOVQ bm, i; SARQ $2, i; JLE .L31_loopE; -.align 32 +ALIGN_5 .L31_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2655,7 +2657,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L311_loopE; -.align 32 +ALIGN_5 .L311_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2732,7 +2734,7 @@ ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L311_bodyB; -.align 32 +ALIGN_5 .L311_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2740,7 +2742,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L312_loopE; -.align 32 +ALIGN_5 .L312_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2787,7 +2789,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L313_loopE; -.align 32 +ALIGN_5 .L313_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2877,11 +2879,11 @@ ADDQ $4, kk; ADDQ $8*SIZE, C0; DECQ i; JG .L31_bodyB; -.align 32 +ALIGN_5 .L31_loopE: TEST $2, bm; JLE .L32_loopE; -.align 32 +ALIGN_5 .L32_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -2910,7 +2912,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L321_loopE; -.align 32 +ALIGN_5 .L321_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2951,7 +2953,7 @@ ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L321_bodyB; -.align 32 +ALIGN_5 .L321_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -2959,7 +2961,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L322_loopE; -.align 32 +ALIGN_5 .L322_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -2988,7 +2990,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L323_loopE; -.align 32 +ALIGN_5 .L323_bodyB: LD_DY 0*SIZE(ptrba), yvec0; BROAD_DY 0*SIZE(ptrbb), yvec2; @@ -3049,7 +3051,7 @@ ADDQ $4*SIZE, C0; .L32_loopE: TEST $1, bm; JLE .L33_loopE; -.align 32 +ALIGN_5 .L33_bodyB: #if !defined(TRMMKERNEL)||(defined(TRMMKERNEL)&&defined(LEFT)&&defined(TRANSA))||(defined(TRMMKERNEL)&&!defined(LEFT)&&!defined(TRANSA)) MOVQ bb,ptrbb; @@ -3078,7 +3080,7 @@ MOVQ %rax, kkk; #endif SARQ $2, k; JLE .L331_loopE; -.align 32 +ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3123,7 +3125,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L331_bodyB; -.align 32 +ALIGN_5 .L331_loopE: #ifndef TRMMKERNEL TEST $2, bk; @@ -3131,7 +3133,7 @@ TEST $2, bk; TEST $2, kkk; #endif JLE .L332_loopE; -.align 32 +ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; @@ -3162,7 +3164,7 @@ TEST $1, bk; TEST $1, kkk; #endif JLE .L333_loopE; -.align 32 +ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; From 6cfcb54a2810b4607f9b9353e275345c2d64f27f Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 07:38:39 +0800 Subject: [PATCH 32/46] Fixed align problem in S and C precision GEMM kernels. 
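The ALIGN-macro conversion in patch 31 missed one directive in each of these two kernels, leaving a bare ".align" with no operand; that form is ambiguous at best and rejected by some assemblers. The two sites now use the same macros as the rest of each file: ALIGN_5 in the cgemm kernel and ALIGN_4 in the sgemm kernel. As the common_x86_64.h context shown in the next patch suggests, the macros just wrap the directive with an explicit operand, roughly:

    #define ALIGN_4 .align 4
    #define ALIGN_5 .align 5

(On Mach-O the operand is the log2 of the alignment, so ALIGN_5 requests 32-byte alignment there; the next patch adds the missing ALIGN_5 definition for Mac OSX.)
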
--- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 2 +- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 56ebee120..5987b8e61 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -3578,7 +3578,7 @@ ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_5 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk; diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 4d16a60d0..23eda3af8 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -2412,7 +2412,7 @@ ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; JG .L241_bodyB; -.align +ALIGN_4 .L241_loopE: #ifndef TRMMKERNEL TEST $2, bk;
From 88c272f6a739039460afbca3e47b55cd3555f585 Mon Sep 17 00:00:00 2001 From: Zhang Xianyi Date: Wed, 20 Jun 2012 09:20:20 +0800 Subject: [PATCH 33/46] Refs #83. Added the missing ALIGN_5 macro on Mac OSX. However, a SEGFAULT bug still exists. --- common_x86_64.h | 1 + 1 file changed, 1 insertion(+) diff --git a/common_x86_64.h b/common_x86_64.h index 7b6d11f7d..19b0ac53c 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -425,6 +425,7 @@ REALNAME: #define ALIGN_2 .align 2 #define ALIGN_3 .align 3 #define ALIGN_4 .align 4 +#define ALIGN_5 .align 5 #define ffreep fstp #endif
From b8b922d334568ea2cf5d7c471be187715ddfb33f Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Wed, 20 Jun 2012 11:07:36 +0800 Subject: [PATCH 34/46] Fixed #106. Use fetch instead of curl on FreeBSD. --- Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 905d686a2..796217291 100644 --- a/Makefile +++ b/Makefile @@ -257,12 +257,16 @@ LAPACK_URL=http://www.netlib.org/lapack/lapack-3.4.1.tgz lapack-3.4.1.tgz : ifndef NOFORTRAN #http://stackoverflow.com/questions/7656425/makefile-ifeq-logical-or -ifeq ($(OSNAME), $(filter $(OSNAME),Darwin FreeBSD NetBSD)) +ifeq ($(OSNAME), $(filter $(OSNAME),Darwin NetBSD)) curl -O $(LAPACK_URL) +else +ifeq ($(OSNAME), FreeBSD) + fetch $(LAPACK_URL) else wget $(LAPACK_URL) endif endif +endif large.tgz : ifndef NOFORTRAN
From d34fce56e4a980fefe4ddafe5d371798ad948b59 Mon Sep 17 00:00:00 2001 From: wangqian Date: Wed, 20 Jun 2012 19:53:18 +0800 Subject: [PATCH 35/46] Refs #83. Fixed the S/DGEMM calling convention bug on Windows. --- kernel/x86_64/dgemm_kernel_4x8_sandy.S | 67 ++++++++++++++------------ kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1 + 2 files changed, 36 insertions(+), 32 deletions(-) diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index c98879d7c..603552464 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -162,7 +162,9 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ST_SX movaps #define ST_DX movapd #define STL_DX movlpd +#define STL_DY vmovlpd #define STH_DX movhpd +#define STH_DY vmovhpd #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup @@ -242,6 +244,7 @@ movq %r15, 40(%rsp); #ifdef TRMMKERNEL movq old_offset, %r11 #endif + movaps %xmm3, %xmm0 #else movq old_ldc, ldc @@ -660,10 +663,10 @@ LDL_DY 2*SIZE(C1), xvec5, xvec5; LDH_DY 3*SIZE(C1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec15, 0*SIZE(C0); -STH_DX xvec15, 1*SIZE(C0); -STL_DX xvec7, 2*SIZE(C1); -STH_DX xvec7, 3*SIZE(C1); +STL_DY xvec15, 0*SIZE(C0); +STH_DY xvec15, 1*SIZE(C0); +STL_DY xvec7, 2*SIZE(C1); +STH_DY xvec7, 3*SIZE(C1); EXTRA_DY $1, yvec14, xvec4; #ifndef TRMMKERNEL @@ -674,10 +677,10 @@ LDL_DY 6*SIZE(C1), xvec2, xvec2; LDH_DY 7*SIZE(C1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec14, 4*SIZE(C0); -STH_DX xvec14, 5*SIZE(C0); -STL_DX xvec4, 6*SIZE(C1); -STH_DX xvec4, 7*SIZE(C1); +STL_DY xvec14, 4*SIZE(C0); +STH_DY xvec14, 5*SIZE(C0); +STL_DY xvec4, 6*SIZE(C1); +STH_DY xvec4, 7*SIZE(C1); EXTRA_DY $1, yvec13, xvec7; #ifndef TRMMKERNEL @@ -688,10 +691,10 @@ LDL_DY 2*SIZE(C1, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C1, ldc, 1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec13, 0*SIZE(C0, ldc, 1); -STH_DX xvec13, 1*SIZE(C0, ldc, 1); -STL_DX xvec7, 2*SIZE(C1, ldc, 1); -STH_DX xvec7, 3*SIZE(C1, ldc, 1); +STL_DY xvec13, 0*SIZE(C0, ldc, 1); +STH_DY xvec13, 1*SIZE(C0, ldc, 1); +STL_DY xvec7, 2*SIZE(C1, ldc, 1); +STH_DY xvec7, 3*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL @@ -702,10 +705,10 @@ LDL_DY 6*SIZE(C1, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C1, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec12, 4*SIZE(C0, ldc, 1); -STH_DX xvec12, 5*SIZE(C0, ldc ,1); -STL_DX xvec4, 6*SIZE(C1, ldc, 1); -STH_DX xvec4, 7*SIZE(C1, ldc, 1); +STL_DY xvec12, 4*SIZE(C0, ldc, 1); +STH_DY xvec12, 5*SIZE(C0, ldc ,1); +STL_DY xvec4, 6*SIZE(C1, ldc, 1); +STH_DY xvec4, 7*SIZE(C1, ldc, 1); EXTRA_DY $1, yvec11, xvec7; #ifndef TRMMKERNEL @@ -716,10 +719,10 @@ LDL_DY 2*SIZE(C0), xvec5, xvec5; LDH_DY 3*SIZE(C0), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec11, 0*SIZE(C1); -STH_DX xvec11, 1*SIZE(C1); -STL_DX xvec7, 2*SIZE(C0); -STH_DX xvec7, 3*SIZE(C0); +STL_DY xvec11, 0*SIZE(C1); +STH_DY xvec11, 1*SIZE(C1); +STL_DY xvec7, 2*SIZE(C0); +STH_DY xvec7, 3*SIZE(C0); EXTRA_DY $1, yvec10, xvec4; #ifndef TRMMKERNEL @@ -730,10 +733,10 @@ LDL_DY 6*SIZE(C0), xvec2, xvec2; LDH_DY 7*SIZE(C0), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec10, 4*SIZE(C1); -STH_DX xvec10, 5*SIZE(C1); -STL_DX xvec4, 6*SIZE(C0); -STH_DX xvec4, 7*SIZE(C0); +STL_DY xvec10, 4*SIZE(C1); +STH_DY xvec10, 5*SIZE(C1); +STL_DY xvec4, 6*SIZE(C0); +STH_DY xvec4, 7*SIZE(C0); EXTRA_DY $1, yvec9, xvec7; #ifndef TRMMKERNEL @@ -744,10 +747,10 @@ LDL_DY 2*SIZE(C0, ldc, 1), xvec5, xvec5; LDH_DY 3*SIZE(C0, ldc ,1), xvec5, xvec5; ADD_DY xvec5, xvec7, xvec7; #endif -STL_DX xvec9, 0*SIZE(C1, ldc, 1); -STH_DX xvec9, 1*SIZE(C1, ldc, 1); -STL_DX xvec7, 2*SIZE(C0, ldc, 1); -STH_DX xvec7, 3*SIZE(C0, ldc, 1); +STL_DY xvec9, 0*SIZE(C1, ldc, 1); +STH_DY xvec9, 1*SIZE(C1, ldc, 1); +STL_DY xvec7, 2*SIZE(C0, ldc, 1); +STH_DY xvec7, 3*SIZE(C0, ldc, 1); EXTRA_DY $1, yvec8, xvec4; #ifndef TRMMKERNEL @@ -758,10 +761,10 @@ LDL_DY 6*SIZE(C0, ldc, 1), xvec2, xvec2; LDH_DY 7*SIZE(C0, ldc, 1), xvec2, xvec2; ADD_DY xvec2, xvec4, xvec4; #endif -STL_DX xvec8, 4*SIZE(C1, ldc, 1); -STH_DX xvec8, 5*SIZE(C1, ldc, 1); -STL_DX xvec4, 6*SIZE(C0, ldc, 1); -STH_DX xvec4, 7*SIZE(C0, 
ldc, 1);
+STL_DY xvec8, 4*SIZE(C1, ldc, 1);
+STH_DY xvec8, 5*SIZE(C1, ldc, 1);
+STL_DY xvec4, 6*SIZE(C0, ldc, 1);
+STH_DY xvec4, 7*SIZE(C0, ldc, 1);
 #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
 MOVQ bk, %rax;
 SUBQ kkk, %rax;
diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S
index 23eda3af8..59458effe 100644
--- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S
+++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S
@@ -251,6 +251,7 @@ movq %r15, 40(%rsp);
 #ifdef TRMMKERNEL
 movq old_offset, %r11
 #endif
+	movaps %xmm3, %xmm0
 #else

 movq old_ldc, ldc

From 037d995c4d8c2c5281d9141ce2905f44cc908ac2 Mon Sep 17 00:00:00 2001
From: Zaheer Chothia
Date: Tue, 19 Jun 2012 22:05:32 +0200
Subject: [PATCH 36/46] Fixed noisy warning with Clang

../common_thread.h:138:24: warning: equality comparison with extraneous parentheses [-Wparentheses-equality]
        if ((blas_cpu_number == 1)
            ~~~~~~~~~~~~~~~~^~~~
../common_thread.h:138:24: note: remove extraneous parentheses around the comparison to silence this warning
        if ((blas_cpu_number == 1)
            ~                ^   ~
../common_thread.h:138:24: note: use '=' to turn this equality comparison into an assignment
        if ((blas_cpu_number == 1)
                             ^~
                             =
---
 common_thread.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/common_thread.h b/common_thread.h
index dc963a635..97e060976 100644
--- a/common_thread.h
+++ b/common_thread.h
@@ -135,7 +135,7 @@ static __inline int num_cpu_avail(int level) {
   int openmp_nthreads=0;
 #endif

-  if ((blas_cpu_number == 1)
+  if (blas_cpu_number == 1
 #ifdef USE_OPENMP
   || omp_in_parallel()

From fda5e0da8a0a43234ef1f70e719f4a5dd60fad0d Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Thu, 21 Jun 2012 08:25:52 +0800
Subject: [PATCH 37/46] Refs #83. Clang 3.1 works fine on Sandy Bridge Mac OS
 X. Updated the documentation.

---
 README | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/README b/README
index 6372e96bd..b3f1baa79 100644
--- a/README
+++ b/README
@@ -34,8 +34,10 @@ Please read GotoBLAS_01Readme.txt
 Additional support CPU:
 x86_64:
 	Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
+	Intel Sandy Bridge
 MIPS64:
-	ICT Loongson 3A //Level 3 BLAS subroutines are optimized.
+	ICT Loongson 3A
+	ICT Loongson 3B (Experimental)

 4.Usages
 Link with libopenblas.a or -lopenblas for shared library.
@@ -70,10 +72,10 @@ OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas
 8.ChangeLog
 Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.

-9.Known Issues
-* The number of CPUs/Cores should less than or equal to 8*sizeof(unsigned long). On 64 bits, the limit
- is 64. On 32 bits, it is 32.
-* On Loongson 3A. make test would be failed because of pthread_create error. The error code is EAGAIN. However, it will be OK when you run the same testcase on shell. I don't think this is a bug in OpenBLAS.
+9.Troubleshooting
+* Please use Clang version 3.1 or above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
+* The number of CPUs/cores should be less than or equal to 256.
+* On Loongson 3A, "make test" may fail with a pthread_create error (EAGAIN). However, the same test case passes when run from the shell.

 10. Specification of Git Branches
 We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
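The one-line sgemm/dgemm change in patch 35 above (`movaps %xmm3, %xmm0` on the Windows path) is a calling-convention detail rather than a computational fix: the Win64 ABI assigns the first four arguments to fixed slots (rcx/rdx/r8/r9 for integers, xmm0-xmm3 for floating point, chosen by position), so a scalar alpha arriving as the fourth argument lands in xmm3, while the System V x86-64 ABI used on Linux delivers the first floating-point argument in xmm0, which is where the kernel body expects it. A hedged sketch; the prototype is illustrative, not the exact kernel signature:

    /* Assumed argument order -- only the position of alpha matters here. */
    long dgemm_kernel(long m, long n, long k, double alpha,
                      double *a, double *b, double *c, long ldc);

    /* Win64:    m -> rcx, n -> rdx, k -> r8,  alpha -> xmm3 (4th slot)
     * System V: m -> rdi, n -> rsi, k -> rdx, alpha -> xmm0 (1st FP reg)
     * The kernel reads alpha from xmm0, hence the Windows-only
     *     movaps %xmm3, %xmm0
     * at function entry. */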
From 157cc5444981c60bd72e924eee0663fb96c6de48 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sat, 23 Jun 2012 08:04:58 +0800
Subject: [PATCH 38/46] Update git ignore file.

---
 .gitignore | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/.gitignore b/.gitignore
index 6cfc5b3c1..118205ca2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,16 +1,23 @@
 *.obj
 *.lib
 *.dll
+*.dylib
 *.def
 *.o
 lapack-3.1.1
 lapack-3.1.1.tgz
+lapack-3.4.1
+lapack-3.4.1.tgz
 *.so
 *.a
 .svn
 *~
+lib.grd
+nohup.out
 config.h
 Makefile.conf
+Makefile.conf_last
+config_last.h
 getarch
 getarch_2nd
 utest/openblas_utest

From fe809c39f9b3696a45531734e85edd9ff5eb93ff Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sat, 23 Jun 2012 08:22:53 +0800
Subject: [PATCH 39/46] Update the docs for 0.2.0

---
 Makefile.rule       |  2 +-
 README => README.md | 80 ++++++++++++++++++++++++++-------------------
 2 files changed, 47 insertions(+), 35 deletions(-)
 rename README => README.md (64%)

diff --git a/Makefile.rule b/Makefile.rule
index 56cd63540..299273773 100644
--- a/Makefile.rule
+++ b/Makefile.rule
@@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.1.1
+VERSION = 0.2.0

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library

diff --git a/README b/README.md
similarity index 64%
rename from README
rename to README.md
index b3f1baa79..000bc4158 100644
--- a/README
+++ b/README.md
@@ -1,34 +1,41 @@
-OpenBLAS Readme
+# OpenBLAS

-1.Introduction
-OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS.(http://www.rdcps.ac.cn)
-Please read the documents on OpenBLAS wiki pages(http://github.com/xianyi/OpenBLAS/wiki).
+## Introduction
+OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.

-2.Intallation
+Please read the documents on OpenBLAS wiki pages.
+
+## Installation
 Download from project homepage. http://xianyi.github.com/OpenBLAS/
 Or,
 check out codes from git://github.com/xianyi/OpenBLAS.git

-1)Normal compile
- (a) type "make" to detect the CPU automatically.
+### Normal compile
+  * type "make" to detect the CPU automatically.
 or
- (b) type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.
+  * type "make TARGET=xxx" to set target CPU, e.g. "make TARGET=NEHALEM". The full target list is in file TargetList.txt.

-2)Cross compile
+### Cross compile
 Please set CC and FC with the cross toolchains. Then, set HOSTCC with your host C compiler. At last, set TARGET explicitly.
-examples:
+
+Examples:
+
 On X86 box, compile this library for loongson3a CPU.
-make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
-3)Debug version
-make DEBUG=1
+
+    make BINARY=64 CC=mips64el-unknown-linux-gnu-gcc FC=mips64el-unknown-linux-gnu-gfortran HOSTCC=gcc TARGET=LOONGSON3A
+
+### Debug version
+
+    make DEBUG=1
+
+### Install to the directory (Optional)
+
+Example:
+
+    make install PREFIX=your_installation_directory
-make install PREFIX=your_installation_directory
 The default directory is /opt/OpenBLAS

-3.Support CPU & OS
+## Support CPU & OS
 Please read GotoBLAS_01Readme.txt

 Additional support CPU:
 x86_64:
 	Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
 	Intel Sandy Bridge
 MIPS64:
 	ICT Loongson 3A
 	ICT Loongson 3B (Experimental)

-4.Usages
+## Usage
 Link with libopenblas.a or -lopenblas for shared library.

-4.1 Set the number of threads with environment variables. for example,
-export OPENBLAS_NUM_THREADS=4
+### Set the number of threads with environment variables.
+
+Examples:
+    export OPENBLAS_NUM_THREADS=4
+
 or
-export GOTO_NUM_THREADS=4
+
+    export GOTO_NUM_THREADS=4
+
 or
-export OMP_NUM_THREADS=4
+
+    export OMP_NUM_THREADS=4

 The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.
 If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.

-4.2 Set the number of threads with calling functions. for example,
-void goto_set_num_threads(int num_threads);
-or
-void openblas_set_num_threads(int num_threads);
+### Set the number of threads by calling functions.
+
+Examples:
+    void goto_set_num_threads(int num_threads);
+    void openblas_set_num_threads(int num_threads);

 If you compile this lib with USE_OPENMP=1, you should use the above functions, too.

-5.Report Bugs
+## Report Bugs
 Please add an issue in https://github.com/xianyi/OpenBLAS/issues

-6.To-Do List:
-Optimization on ICT Loongson 3A CPU
-
-7.Contact
+## Contact
 OpenBLAS users mailing list: http://list.rdcps.ac.cn/mailman/listinfo/openblas

-8.ChangeLog
+## ChangeLog
 Please see Changelog.txt to obtain the differences between GotoBLAS2 1.13 BSD version.

-9.Troubleshooting
+## Troubleshooting
+* Please use gcc version 4.6 and above to compile Sandy Bridge AVX kernels on Linux/MinGW/BSD.
 * Please use Clang version 3.1 or above to compile the library on the Sandy Bridge microarchitecture. Clang 3.0 generates incorrect AVX binary code.
 * The number of CPUs/cores should be less than or equal to 256.
 * On Loongson 3A, "make test" may fail with a pthread_create error (EAGAIN). However, the same test case passes when run from the shell.

-10. Specification of Git Branches
+## Specification of Git Branches
 We used the git branching model in this article (http://nvie.com/posts/a-successful-git-branching-model/).
 Now, there are 4 branches on github.com.
 * The master branch. This is a main branch to reflect a production-ready state.
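A minimal C sketch of the thread controls the README above describes (hedged: `openblas_set_num_threads` only becomes a public symbol of the shared library in patch 42 below, and the exact runtime effect depends on how the library was built, e.g. with or without USE_OPENMP):

    /* Build (assumed): gcc demo.c -o demo -lopenblas */
    #include <stdio.h>

    void goto_set_num_threads(int num_threads);      /* GotoBLAS2-compatible name */
    void openblas_set_num_threads(int num_threads);  /* OpenBLAS name             */

    int main(void) {
        /* Same intent as exporting OPENBLAS_NUM_THREADS=4 before startup,
         * but applied at run time from the program itself. */
        openblas_set_num_threads(4);
        goto_set_num_threads(4);
        printf("OpenBLAS thread pool capped at 4 threads\n");
        return 0;
    }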
From a6214c057e6b06783e08c3b450a24c3f86a63c31 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sat, 23 Jun 2012 08:34:40 +0800
Subject: [PATCH 40/46] Modified readme.

---
 README.md | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index 000bc4158..80116c658 100644
--- a/README.md
+++ b/README.md
@@ -3,12 +3,12 @@
 ## Introduction
 OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenBLAS is an open source project supported by Lab of Parallel Software and Computational Science, ISCAS <http://www.rdcps.ac.cn>.

-Please read the documents on OpenBLAS wiki pages.
+Please read the documents on OpenBLAS wiki pages <http://github.com/xianyi/OpenBLAS/wiki>.

 ## Installation
 Download from project homepage. http://xianyi.github.com/OpenBLAS/
-Or,
-check out codes from git://github.com/xianyi/OpenBLAS.git
+
+Or, check out codes from git://github.com/xianyi/OpenBLAS.git

 ### Normal compile
   * type "make" to detect the CPU automatically.
 or
@@ -38,13 +38,15 @@ The default directory is /opt/OpenBLAS
 ## Support CPU & OS
 Please read GotoBLAS_01Readme.txt

-Additional support CPU:
-x86_64:
-	Intel Xeon 56xx (Westmere) //Used GotoBLAS2 Nehalem codes.
-	Intel Sandy Bridge
-MIPS64:
-	ICT Loongson 3A
-	ICT Loongson 3B (Experimental)
+### Additional support CPU:
+
+#### x86/x86-64:
+* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes.
+* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64.
+* AMD Bobcat. Used GotoBLAS2 Barcelona codes.
+#### MIPS64:
+* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2.
+* ICT Loongson 3B (Experimental)

 ## Usage
 Link with libopenblas.a or -lopenblas for shared library.
@@ -52,6 +54,7 @@ Link with libopenblas.a or -lopenblas for shared library.
 ### Set the number of threads with environment variables.

 Examples:
+
     export OPENBLAS_NUM_THREADS=4

 or
@@ -69,7 +72,9 @@ If you compile this lib with USE_OPENMP=1, you should set OMP_NUM_THREADS enviro
 ### Set the number of threads by calling functions.

 Examples:
+
     void goto_set_num_threads(int num_threads);
+
     void openblas_set_num_threads(int num_threads);

 If you compile this lib with USE_OPENMP=1, you should use the above functions, too.

From 544af1efec5602e7413c1211dd0deb92d97b5b26 Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sat, 23 Jun 2012 09:35:19 +0800
Subject: [PATCH 41/46] Correct errors in the readme

---
 README.md | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 80116c658..a13e069ec 100644
--- a/README.md
+++ b/README.md
@@ -41,12 +41,19 @@ Please read GotoBLAS_01Readme.txt
 ### Additional support CPU:

 #### x86/x86-64:
-* Intel Xeon 56xx (Westmere). Used GotoBLAS2 Nehalem codes.
-* Intel Sandy Bridge. Optimized Level-3 BLAS with AVX on x86-64.
-* AMD Bobcat. Used GotoBLAS2 Barcelona codes.
+- **Intel Xeon 56xx (Westmere)**: Used GotoBLAS2 Nehalem codes.
+- **Intel Sandy Bridge**: Optimized Level-3 BLAS with AVX on x86-64.
+- **AMD Bobcat**: Used GotoBLAS2 Barcelona codes.
+
 #### MIPS64:
-* ICT Loongson 3A. Optimized Level-3 BLAS and the part of Level-1,2.
-* ICT Loongson 3B (Experimental)
+- **ICT Loongson 3A**: Optimized Level-3 BLAS and part of Level-1,2.
+- **ICT Loongson 3B**: Experimental
+
+### Support OS:
+- **GNU/Linux**
+- **MinGW/Windows**: Please read the OpenBLAS wiki.
+- **Darwin/Mac OS X**: Experimental. Although GotoBLAS2 supports Darwin, we are still new to Mac OS X.
+- **FreeBSD**: Supported by the community. We have not tested the library on this OS.

From 422359d09ac28b27bb652b303318485fb2c02cca Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Sat, 23 Jun 2012 11:32:43 +0800
Subject: [PATCH 42/46] Export openblas_set_num_threads in shared library.
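The hunks below declare the new entry point twice: once as a plain C symbol in cblas.h, and once through BLASFUNC in common_interface.h as a Fortran-callable variant that takes its argument by reference (the underscore-suffixed name also appears in the gensymbol export lists). A hedged sketch of how the resulting pair is used:

    /* Sketch only; the underscore suffix assumes the default BLASFUNC
     * Fortran name mangling used elsewhere in this series. */
    void openblas_set_num_threads(int num_threads);    /* C interface       */
    void openblas_set_num_threads_(int *num_threads);  /* Fortran interface */

    void cap_threads_example(void) {
        int n = 8;
        openblas_set_num_threads(n);    /* argument by value   */
        openblas_set_num_threads_(&n);  /* argument by pointer */
    }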
--- cblas.h | 4 ++++ common_interface.h | 2 ++ driver/others/Makefile | 4 ++-- driver/others/openblas_set_num_threads.c | 13 ++++++++++--- exports/gensymbol | 18 +++++++++++++----- 5 files changed, 31 insertions(+), 10 deletions(-) diff --git a/cblas.h b/cblas.h index f3708a994..ee8bf08b2 100644 --- a/cblas.h +++ b/cblas.h @@ -9,6 +9,10 @@ extern "C" { #include #include "common.h" +/*Set the number of threads on runtime.*/ +void openblas_set_num_threads(int num_threads); +void goto_set_num_threads(int num_threads); + #define CBLAS_INDEX size_t enum CBLAS_ORDER {CblasRowMajor=101, CblasColMajor=102}; diff --git a/common_interface.h b/common_interface.h index 898d91001..dbe0bb851 100644 --- a/common_interface.h +++ b/common_interface.h @@ -45,6 +45,8 @@ extern "C" { int BLASFUNC(xerbla)(char *, blasint *info, blasint); +void BLASFUNC(openblas_set_num_threads)(int *); + FLOATRET BLASFUNC(sdot) (blasint *, float *, blasint *, float *, blasint *); FLOATRET BLASFUNC(sdsdot)(blasint *, float *, float *, blasint *, float *, blasint *); diff --git a/driver/others/Makefile b/driver/others/Makefile index 75b552b65..2fdbb4a42 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -1,12 +1,12 @@ TOPDIR = ../.. include ../../Makefile.system -COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) +COMMONOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) c_abs.$(SUFFIX) z_abs.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) COMMONOBJS += slamch.$(SUFFIX) slamc3.$(SUFFIX) dlamch.$(SUFFIX) dlamc3.$(SUFFIX) ifdef SMP -COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) openblas_set_num_threads.$(SUFFIX) +COMMONOBJS += blas_server.$(SUFFIX) divtable.$(SUFFIX) blasL1thread.$(SUFFIX) ifndef NO_AFFINITY COMMONOBJS += init.$(SUFFIX) endif diff --git a/driver/others/openblas_set_num_threads.c b/driver/others/openblas_set_num_threads.c index 7ca3b7114..27de83ffc 100644 --- a/driver/others/openblas_set_num_threads.c +++ b/driver/others/openblas_set_num_threads.c @@ -1,5 +1,5 @@ /***************************************************************************** -Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS +Copyright (c) 2011,2012 Lab of Parallel Software and Computational Science,ISCAS All rights reserved. Redistribution and use in source and binary forms, with or without @@ -33,7 +33,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include "common.h" #ifdef SMP_SERVER -#ifdef OS_LINUX extern void openblas_set_num_threads(int num_threads) ; @@ -41,5 +40,13 @@ void NAME(int* num_threads){ openblas_set_num_threads(*num_threads); } -#endif +#else +//Single thread + +void openblas_set_num_threads(int num_threads) { +} + +void NAME(int* num_threads){ + +} #endif diff --git a/exports/gensymbol b/exports/gensymbol index dbd559473..61e7c8367 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -72,6 +72,14 @@ zgemm3m, cgemm3m, zsymm3m, csymm3m, zhemm3m, chemm3m, ); +@misc_no_underscore_objs = ( + openblas_set_num_threads, goto_set_num_threads, + ); + +@misc_underscore_objs = ( + openblas_set_num_threads, + ); + @lapackobjs = ( # These routines are provided by OpenBLAS. 
sgesv, dgesv, cgesv, zgesv, @@ -2660,11 +2668,11 @@ if ($ARGV[5] == 1) { #NO_LAPACK=1 - @underscore_objs = (@blasobjs); + @underscore_objs = (@blasobjs, @misc_underscore_objs); } elsif (-d "../lapack-3.1.1" || -d "../lapack-3.4.0" || -d "../lapack-3.4.1") { - @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2); + @underscore_objs = (@blasobjs, @lapackobjs, @lapackobjs2, @misc_underscore_objs); } else { - @underscore_objs = (@blasobjs, @lapackobjs); + @underscore_objs = (@blasobjs, @lapackobjs, @misc_underscore_objs); } if ($ARGV[3] == 1){ @underscore_objs = (@underscore_objs, @exblasobjs); }; @@ -2678,10 +2686,10 @@ if ($ARGV[1] eq "ia64"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[1] eq "MIPS"){ @underscore_objs = (@underscore_objs, @gemm3mobjs); }; if ($ARGV[4] == 0) { - @no_underscore_objs = (@cblasobjs); + @no_underscore_objs = (@cblasobjs, @misc_no_underscore_objs); }else{ #NO_CBLAS=1 - @no_underscore_objs = (); + @no_underscore_objs = (@misc_no_underscore_objs); } if ($ARGV[6] == 1) { #NO_LAPACKE=1 From 853d16ed7ec9169cf03ec024f5894e9a597c7da1 Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Sat, 23 Jun 2012 13:07:38 +0800 Subject: [PATCH 43/46] Added openblas_set_num_threads dummy function on Windows. We plan to implement this feature in next version. --- driver/others/blas_server_win32.c | 8 ++++++++ exports/gensymbol | 10 +++++++++- 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c index 6708509e1..c71e7c276 100644 --- a/driver/others/blas_server_win32.c +++ b/driver/others/blas_server_win32.c @@ -63,6 +63,14 @@ static blas_pool_t pool; static HANDLE blas_threads [MAX_CPU_NUMBER]; static DWORD blas_threads_id[MAX_CPU_NUMBER]; +void goto_set_num_threads(int num) +{ +} + +void openblas_set_num_threads(int num) +{ +} + static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){ if (!(mode & BLAS_COMPLEX)){ diff --git a/exports/gensymbol b/exports/gensymbol index 61e7c8367..e09a8b6ab 100644 --- a/exports/gensymbol +++ b/exports/gensymbol @@ -2759,6 +2759,10 @@ if ($ARGV[0] eq "aix"){ if ($ARGV[0] eq "win2k"){ print "EXPORTS\n"; $count = 1; + + #remove openblas_set_num_threads + @underscore_objs = grep /[^openblas_set_num_threads]/,@underscore_objs; + foreach $objs (@underscore_objs) { $uppercase = $objs; $uppercase =~ tr/[a-z]/[A-Z]/; @@ -2769,7 +2773,11 @@ if ($ARGV[0] eq "win2k"){ print "\t$uppercase=$objs", "_ \@", $count, "\n"; $count ++; } - + + #for openblas_set_num_threads + print "\topenblas_set_num_threads_=openblas_set_num_threads_ \@", $count, "\n"; + $count ++; + # if ($ARGV[4] == 0) { foreach $objs (@no_underscore_objs) { print "\t",$objs,"=$objs"," \@", $count, "\n"; From b39c51195b0ec09d17a0bcf345fcd7873f352acc Mon Sep 17 00:00:00 2001 From: Xianyi Zhang Date: Mon, 25 Jun 2012 14:29:17 +0800 Subject: [PATCH 44/46] Fixed the build bug about Sandy Bridge on 32-bit. We used Nehalem/Penryn codes on Sandy Bridge 32-bit. --- param.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/param.h b/param.h index 27aeeb6ac..5465c1cbd 100644 --- a/param.h +++ b/param.h @@ -928,14 +928,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#ifdef ARCH_X86 #define SGEMM_DEFAULT_UNROLL_M 4 -#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_M 2 #define QGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_M 2 #define ZGEMM_DEFAULT_UNROLL_M 1 #define XGEMM_DEFAULT_UNROLL_M 1 #define SGEMM_DEFAULT_UNROLL_N 4 -#define DGEMM_DEFAULT_UNROLL_N 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define QGEMM_DEFAULT_UNROLL_N 2 #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 From 857a0fa0df83cd3ff79a4047ad31a9f0f9e9f5da Mon Sep 17 00:00:00 2001 From: wangqian Date: Mon, 25 Jun 2012 19:00:37 +0800 Subject: [PATCH 45/46] Fixed the issue of mixing AVX and SSE codes in S/D/C/ZGEMM. --- kernel/x86_64/cgemm_kernel_4x8_sandy.S | 1827 ++++++++++++------------ kernel/x86_64/dgemm_kernel_4x8_sandy.S | 1033 +++++++------- kernel/x86_64/sgemm_kernel_8x8_sandy.S | 1587 ++++++++++---------- kernel/x86_64/zgemm_kernel_4x4_sandy.S | 358 +++-- 4 files changed, 2366 insertions(+), 2439 deletions(-) diff --git a/kernel/x86_64/cgemm_kernel_4x8_sandy.S b/kernel/x86_64/cgemm_kernel_4x8_sandy.S index 5987b8e61..5a5588089 100644 --- a/kernel/x86_64/cgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/cgemm_kernel_4x8_sandy.S @@ -150,79 +150,54 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd -#define SUB_DY vsubpd +#define ADD_SX vaddps #define SUB_SY vsubps -#define SUB_DX subpd -#define SUB_SX subps +#define SUB_SX vsubps -#define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd #define ADDSUB_SY vaddsubps -#define ADDSUB_SX addsubps +#define ADDSUB_SX vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 -#define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY vextractf128 -#define EXTRA_DY vextractf128 #if defined(NN) || defined(NT) || defined(TN) || defined(TT) #define ADD1_SY ADD_SY @@ -289,6 +264,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1417,64 +1394,64 @@ REVS_SY $0xe4,yvec7,yvec9,yvec9; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef 
TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec6; -LDH_SX 2*SIZE(C0), xvec6; -ADD_SX xvec6, xvec15; +LDL_SX 0*SIZE(C0), xvec6, xvec6; +LDH_SX 2*SIZE(C0), xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1), xvec4; -LDH_SX 6*SIZE(C1), xvec4; -ADD_SX xvec4, xvec7; +LDL_SX 4*SIZE(C1), xvec4, xvec4; +LDH_SX 6*SIZE(C1), xvec4, xvec4; +ADD_SX xvec4, xvec7, xvec7; #endif STL_SX xvec7, 4*SIZE(C1); STH_SX xvec7, 6*SIZE(C1); EXTRA_SY $1, yvec13, xvec5; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0, ldc, 1), xvec4; -LDH_SX 2*SIZE(C0, ldc, 1), xvec4; -ADD_SX xvec4, xvec13; +LDL_SX 0*SIZE(C0, ldc, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, ldc, 1), xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; #endif STL_SX xvec13, 0*SIZE(C0, ldc, 1); STH_SX xvec13, 2*SIZE(C0, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C1, ldc, 1), xvec2; -LDH_SX 6*SIZE(C1, ldc, 1), xvec2; -ADD_SX xvec2, xvec5; +LDL_SX 4*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 6*SIZE(C1, ldc, 1), xvec2, xvec2; +ADD_SX xvec2, xvec5, xvec5; #endif STL_SX xvec5, 4*SIZE(C1, ldc, 1); STH_SX xvec5, 6*SIZE(C1, ldc, 1); EXTRA_SY $1, yvec11, xvec3; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1), xvec2; -ADD_SX xvec2, xvec11; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1), xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0), xvec0; -LDH_SX 6*SIZE(C0), xvec0; -ADD_SX xvec0, xvec3; +LDL_SX 4*SIZE(C0), xvec0, xvec0; +LDH_SX 6*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec3, xvec3; #endif STL_SX xvec3, 4*SIZE(C0); STH_SX xvec3, 6*SIZE(C0); EXTRA_SY $1, yvec9, xvec1; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1, ldc, 1), xvec0; -LDH_SX 2*SIZE(C1, ldc, 1), xvec0; -ADD_SX xvec0, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, ldc, 1), xvec0, xvec0; +ADD_SX xvec0, xvec9, xvec9; #endif STL_SX xvec9, 0*SIZE(C1, ldc, 1); STH_SX xvec9, 2*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_SX 4*SIZE(C0, ldc, 1), xvec6; -LDH_SX 6*SIZE(C0, ldc, 1), xvec6; -ADD_SX xvec6, xvec1; +LDL_SX 4*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 6*SIZE(C0, ldc, 1), xvec6, xvec6; +ADD_SX xvec6, xvec1, xvec1; #endif STL_SX xvec1, 4*SIZE(C0, ldc, 1); STH_SX xvec1, 6*SIZE(C0, ldc, 1); @@ -1533,122 +1510,122 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 
4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 16*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 24*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; 
+ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1666,62 +1643,62 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 8*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1737,32 +1714,32 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; # ar1, ai1, ar2, ai2 EDUP_SX 0*SIZE(ptrbb), xvec2; # br1, br1, br2, br2 SHUF_SX $0x4e, xvec2, xvec3; # br3, br3, br4, br4 -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD1_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD1_SX xvec5, xvec12, xvec12; SHUF_SX $0xb1, xvec0, xvec1; ODUP_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x4e, xvec2, xvec3; -MUL_SX xvec1, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD2_SX xvec2, xvec15, 
xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec14, xvec14; ODUP_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -1770,29 +1747,29 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -1800,16 +1777,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec13, xvec13; SHUF_SX $0xb1, xvec12, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -1821,35 +1798,35 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0, ldc,1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc,1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; 
+LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0, ldc, 1); @@ -1911,70 +1888,70 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 24*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; DECQ k; @@ -1992,36 +1969,36 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX 
$0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2037,19 +2014,19 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec4; SHUF_SX $0xb1, xvec4, xvec5; -MUL_SX xvec0, xvec4; -ADD1_SX xvec4, xvec14; +MUL_SX xvec0, xvec4, xvec4; +ADD1_SX xvec4, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; -MUL_SX xvec1, xvec5; -ADD2_SX xvec5, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD2_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2057,26 +2034,26 @@ ADDQ $8*SIZE, ptrbb; #### Handle #### #if defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2086,21 +2063,21 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C0, ldc, 1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 0*SIZE(C1, ldc, 1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 1); @@ -2191,59 +2168,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; 
+MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2252,59 +2229,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, 
xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; @@ -2313,59 +2290,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 32*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 36*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 40*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 44*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; @@ -2374,59 +2351,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 48*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 52*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX 
$0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 56*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 60*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $64*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2448,59 +2425,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; 
+MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; @@ -2509,59 +2486,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2581,59 +2558,59 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec13; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX 
xvec0, xvec13, xvec13; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec9; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec9, xvec9; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec13; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec9; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec13, xvec13; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec9, xvec9; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec12; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec12, xvec12; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec8; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec8, xvec8; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec12; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec8; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec12, xvec12; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec8, xvec8; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb @@ -2641,53 +2618,53 @@ ADDQ $4*SIZE, ptrbb #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec13, xvec7; +SUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec12, xvec7; +SUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec9, xvec7; +SUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec8, xvec7; +SUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -2699,28 +2676,28 @@ SHUF_SX $0xb1, xvec10, xvec10; SHUF_SX $0xb1, xvec9, xvec9; SHUF_SX $0xb1, xvec8, xvec8; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec13, xvec7; +ADDSUB_SX xvec13, xvec7, xvec7; MOV_SX xvec7, xvec13; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec12, xvec7; +ADDSUB_SX xvec12, xvec7, xvec7; MOV_SX xvec7, xvec12; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; 
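# Note: in the #### Handle #### blocks above, assuming ADDSUB_SX expands to
# vaddsubps and SUB_SX to vsubps (the macro header for this kernel sits
# outside this diff), the pattern
#   XOR_SY    yvec7, yvec7, yvec7;     # yvec7 = 0
#   ADDSUB_SX xvec15, xvec7, xvec7;    # xvec7 = (0-r0, 0+i0, 0-r1, 0+i1)
#   MOV_SX    xvec7, xvec15;
# flips the sign of the even (real) lanes of a complex accumulator, the
# SUB_SX variant negates all four lanes, and the RR/RC/CR/CC branch brackets
# the same trick with SHUF_SX $0xb1 pair swaps so that the odd (imaginary)
# lanes are negated instead, i.e. the accumulator is conjugated.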
MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec9, xvec7; +ADDSUB_SX xvec9, xvec7, xvec7; MOV_SX xvec7, xvec9; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec8, xvec7; +ADDSUB_SX xvec8, xvec7, xvec7; MOV_SX xvec7, xvec8; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -2736,50 +2713,50 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec13, xvec3; -MUL_SX xvec7, xvec13; -MUL_SX xvec6, xvec3; -ADDSUB_SX xvec3, xvec13; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec6, xvec3, xvec3; +ADDSUB_SX xvec3, xvec13, xvec13; VPERMILP_SX $0xb1,xvec12, xvec2; -MUL_SX xvec7, xvec12; -MUL_SX xvec6, xvec2; -ADDSUB_SX xvec2, xvec12; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec6, xvec2, xvec2; +ADDSUB_SX xvec2, xvec12, xvec12; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; VPERMILP_SX $0xb1,xvec9, xvec5; -MUL_SX xvec7, xvec9; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec9; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec9, xvec9; VPERMILP_SX $0xb1,xvec8, xvec4; -MUL_SX xvec7, xvec8; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec8; +MUL_SX xvec7, xvec8, xvec8; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec8, xvec8; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C1), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C1), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2790,18 +2767,18 @@ STH_SX xvec13, 10*SIZE(C1); STL_SX xvec12, 12*SIZE(C0); STH_SX xvec12, 14*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 8*SIZE(C1), xvec6; -LDH_SX 10*SIZE(C0), xvec6; -LDL_SX 12*SIZE(C1), xvec7; -LDH_SX 14*SIZE(C0), xvec7; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 8*SIZE(C1), xvec6, xvec6; +LDH_SX 10*SIZE(C0), xvec6, xvec6; +LDL_SX 12*SIZE(C1), xvec7, xvec7; +LDH_SX 
14*SIZE(C0), xvec7, xvec7; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -2872,31 +2849,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -2906,31 +2883,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -2940,31 +2917,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 16*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 20*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, 
xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -2974,31 +2951,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 24*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 28*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $32*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3020,31 +2997,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3054,31 +3031,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; 
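# Note: every rewrite above follows one mechanical pattern. A two-operand SSE
# macro call such as
#   MUL_SX xvec4, xvec0;             # destructive: xvec0 = xvec0 * xvec4
# becomes the three-operand AVX/VEX form
#   MUL_SX xvec4, xvec0, xvec0;      # vmulps-style: dst = xvec0 * xvec4
# with the old destination repeated as the explicit output, so the
# arithmetic is unchanged while the macros are free to expand to VEX-encoded
# instructions (vmulps, vaddps, ...). The exact expansions of
# MUL_SX/ADD1_SX/ADD2_SX are assumed here, since the header defining them
# for this file is not part of this diff.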
ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3099,31 +3076,31 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec14; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec14, xvec14; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec10; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec10, xvec10; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec14; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec10; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec14, xvec14; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec10, xvec10; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3131,29 +3108,29 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec14, xvec7; +SUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec10, xvec7; +SUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; @@ -3161,16 +3138,16 @@ SHUF_SX $0xb1, xvec14, xvec14; SHUF_SX $0xb1, xvec11, xvec11; SHUF_SX $0xb1, xvec10, xvec10; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec14, xvec7; +ADDSUB_SX xvec14, xvec7, xvec7; MOV_SX xvec7, xvec14; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec10, xvec7; +ADDSUB_SX xvec10, xvec7, xvec7; MOV_SX xvec7, xvec10; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec14, xvec14; @@ -3182,40 +3159,40 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec14, xvec4; -MUL_SX xvec7, xvec14; -MUL_SX xvec6, xvec4; -ADDSUB_SX xvec4, xvec14; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec6, xvec4, xvec4; +ADDSUB_SX xvec4, xvec14, xvec14; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, 
xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; VPERMILP_SX $0xb1,xvec10, xvec0; -MUL_SX xvec7, xvec10; -MUL_SX xvec6, xvec0; -ADDSUB_SX xvec0, xvec10; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec6, xvec0, xvec0; +ADDSUB_SX xvec0, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); STL_SX xvec14, 4*SIZE(C0); STH_SX xvec14, 6*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3277,17 +3254,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3297,17 +3274,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 3 #### EDUP_SX 8*SIZE(ptrbb), xvec4; @@ -3317,17 +3294,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 8*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 4 #### EDUP_SX 12*SIZE(ptrbb), xvec4; @@ -3337,17 +3314,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 12*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX 
xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3369,17 +3346,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; #### Unroll 2 ##### EDUP_SX 4*SIZE(ptrbb), xvec4; @@ -3389,17 +3366,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 4*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3419,17 +3396,17 @@ SHUF_SX $0x4e, xvec5, xvec7; LD_SX 0*SIZE(ptrba), xvec0; MOV_SX xvec0, xvec1; -MUL_SX xvec4, xvec0; -ADD1_SX xvec0, xvec15; +MUL_SX xvec4, xvec0, xvec0; +ADD1_SX xvec0, xvec15, xvec15; SHUF_SX $0xb1, xvec1, xvec2; -MUL_SX xvec6, xvec1; -ADD1_SX xvec1, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADD1_SX xvec1, xvec11, xvec11; MOV_SX xvec2, xvec3; -MUL_SX xvec5, xvec2; -ADD2_SX xvec2, xvec15; -MUL_SX xvec7, xvec3; -ADD2_SX xvec3, xvec11; +MUL_SX xvec5, xvec2, xvec2; +ADD2_SX xvec2, xvec15, xvec15; +MUL_SX xvec7, xvec3, xvec3; +ADD2_SX xvec3, xvec11, xvec11; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3437,26 +3414,26 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### #if defined(RN) || defined(RT) || defined(CN) || defined(CT) XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -SUB_SX xvec11, xvec7; +SUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; XOR_SY yvec7, yvec7, yvec7; -ADDSUB_SX xvec11, xvec7; +ADDSUB_SX xvec11, xvec7, xvec7; MOV_SX xvec7, xvec11; SHUF_SX $0xb1, xvec15, xvec15; SHUF_SX $0xb1, xvec11, xvec11; @@ -3466,24 +3443,24 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; VPERMILP_SX $0xb1,xvec11, xvec1; -MUL_SX xvec7, xvec11; -MUL_SX xvec6, xvec1; -ADDSUB_SX xvec1, xvec11; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec6, xvec1, xvec1; +ADDSUB_SX xvec1, xvec11, xvec11; 
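# Note: the alpha scaling just above computes a packed complex product. For
# one accumulator x = (xr, xi), with xvec7 holding broadcast alpha_r and
# xvec6 broadcast alpha_i, and assuming VPERMILP_SX/MUL_SX/ADDSUB_SX expand
# to vpermilps/vmulps/vaddsubps:
#   VPERMILP_SX $0xb1, xvec15, xvec5;  # xvec5  = (xi, xr)
#   MUL_SX xvec7, xvec15, xvec15;      # xvec15 = (xr*ar, xi*ar)
#   MUL_SX xvec6, xvec5, xvec5;        # xvec5  = (xi*ai, xr*ai)
#   ADDSUB_SX xvec5, xvec15, xvec15;   # (xr*ar - xi*ai, xi*ar + xr*ai)
# which matches (ar + ai*i)*(xr + xi*i) = (ar*xr - ai*xi) + (ar*xi + ai*xr)*i.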
#ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -ADD_SX xvec4, xvec11; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; #endif STL_SX xvec11, 0*SIZE(C1); STH_SX xvec11, 2*SIZE(C0); @@ -3538,42 +3515,42 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 4*SIZE(ptrba), xvec0; LD_SX 8*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 5*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 6*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 7*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -3591,22 +3568,22 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -3622,12 +3599,12 @@ ALIGN_5 BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xb1, xvec2, xvec3; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec1; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3635,14 +3612,14 @@ ADDQ $4*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(NR) || defined(NC) || defined(TR) || defined(TC) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RN) || defined(RT) || defined(CN) || defined(CT) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX 
xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -3651,14 +3628,14 @@ BROAD_SX MEMALPHA_R,xvec7; BROAD_SX MEMALPHA_I,xvec6; ##### Multiply Alpha #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 0*SIZE(C1), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 0*SIZE(C1), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C1); @@ -3908,18 +3885,18 @@ ADDSUB_SY yvec4, yvec14, yvec14; EXTRA_SY $1, yvec15, xvec7; EXTRA_SY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -LDL_SX 8*SIZE(C0), xvec2; -LDH_SX 10*SIZE(C0), xvec2; -LDL_SX 12*SIZE(C0), xvec3; -LDH_SX 14*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec7; -ADD_SX xvec2, xvec14; -ADD_SX xvec3, xvec6; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +LDL_SX 8*SIZE(C0), xvec2, xvec2; +LDH_SX 10*SIZE(C0), xvec2, xvec2; +LDL_SX 12*SIZE(C0), xvec3, xvec3; +LDH_SX 14*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; +ADD_SX xvec2, xvec14, xvec14; +ADD_SX xvec3, xvec6, xvec6; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4103,12 +4080,12 @@ ADDSUB_SY yvec5, yvec15, yvec15; #### Writing back #### EXTRA_SY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec7; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec7, xvec7; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4163,42 +4140,42 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 3 #### LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 5*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 4 #### LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 6*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 7*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, 
xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4216,22 +4193,22 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; #### Unroll 2 #### LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 3*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4247,12 +4224,12 @@ ALIGN_5 #### Unroll 1 #### LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD1_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD1_SX xvec2, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; BROAD_SX 1*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec3; -ADD2_SX xvec3, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD2_SX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4260,14 +4237,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif @@ -4276,13 +4253,13 @@ BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; #### Writng back #### VPERMILP_SX $0xb1,xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -4335,22 +4312,22 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -4368,12 +4345,12 @@ ALIGN_5 LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0xa0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xb1, xvec0, xvec1; SHUF_SX $0xf5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, 
xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -4388,15 +4365,15 @@ ALIGN_5 .L343_bodyB: XOR_SY yvec0, yvec0, yvec0; XOR_SY yvec2, yvec2, yvec2; -LDL_SX 0*SIZE(ptrba), xvec0; -LDL_SX 0*SIZE(ptrbb), xvec2; +LDL_SX 0*SIZE(ptrba), xvec0, xvec0; +LDL_SX 0*SIZE(ptrbb), xvec2, xvec2; SHUF_SX $0xe0, xvec2, xvec3; -MUL_SX xvec0, xvec3; -ADD1_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD1_SX xvec3, xvec15, xvec15; SHUF_SX $0xe1, xvec0, xvec1; SHUF_SX $0xe5, xvec2, xvec4; -MUL_SX xvec1, xvec4; -ADD2_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD2_SX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -4404,29 +4381,29 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_SY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(NR) || defined(NC) || defined(TR) || defined(TC) -SUB_SX xvec15, xvec7; +SUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; #elif defined(RR) || defined(RC) || defined(CR) || defined(CC) SHUF_SX $0xb1, xvec15, xvec15; -ADDSUB_SX xvec15, xvec7; +ADDSUB_SX xvec15, xvec7, xvec7; MOV_SX xvec7, xvec15; SHUF_SX $0xb1, xvec15, xvec15; #endif BROAD_SX MEMALPHA_R, xvec7; BROAD_SX MEMALPHA_I, xvec6; VPERMILP_SX $0xb1, xvec15, xvec5; -MUL_SX xvec7, xvec15; -MUL_SX xvec6, xvec5; -ADDSUB_SX xvec5, xvec15; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec6, xvec5, xvec5; +ADDSUB_SX xvec5, xvec15, xvec15; SHUF_SX $0x44, xvec15, xvec14; SHUF_SX $0xee, xvec15, xvec13; -ADD_SX xvec13, xvec14; +ADD_SX xvec13, xvec14, xvec14; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -ADD_SX xvec0, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec14, xvec14; #endif STL_SX xvec14, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA))||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -4458,6 +4435,8 @@ movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/dgemm_kernel_4x8_sandy.S b/kernel/x86_64/dgemm_kernel_4x8_sandy.S index 603552464..3b1b2560e 100644 --- a/kernel/x86_64/dgemm_kernel_4x8_sandy.S +++ b/kernel/x86_64/dgemm_kernel_4x8_sandy.S @@ -143,71 +143,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_DX movlpd +#define LD_DX vmovapd +#define LDL_DX vmovlpd #define LDL_DY vmovlpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd #define LDH_DY vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_DX movlpd +#define ST_DX vmovapd +#define STL_DX vmovlpd #define STL_DY vmovlpd -#define STH_DX movhpd +#define STH_DX vmovhpd #define STH_DY vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define ADD1_DY vaddpd #define ADD2_DY vaddpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_SY vaddsubps -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_DX vmovsd -#define EXTRA_SY vextractf128 #define EXTRA_DY vextractf128 PROLOGUE @@ -253,6 +231,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -988,14 +968,14 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 0*SIZE(C0,ldc,1),xvec13; -ADD_DX 2*SIZE(C1,ldc,1),xvec5; -ADD_DX 0*SIZE(C1),xvec11; -ADD_DX 2*SIZE(C0),xvec3; -ADD_DX 0*SIZE(C1,ldc,1),xvec9; -ADD_DX 2*SIZE(C0,ldc,1),xvec1; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C1, ldc, 1), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec3, xvec3; +ADD_DX 0*SIZE(C1, ldc, 1), xvec9, xvec9; +ADD_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C1); @@ -1025,18 +1005,18 @@ EXTRA_DY $1,yvec13,xvec5; EXTRA_DY $1,yvec11,xvec3; EXTRA_DY $1,yvec9,xvec1; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec15; -ADD_DX xvec12, xvec13; -ADD_DX xvec10, xvec11; -ADD_DX xvec8, xvec9; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec15, xvec15; +ADD_DX xvec12, xvec13, xvec13; +ADD_DX xvec10, xvec11, xvec11; +ADD_DX xvec8, xvec9, xvec9; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1047,18 +1027,18 @@ STH_DX xvec11, 1*SIZE(C1); STL_DX xvec9, 0*SIZE(C1, ldc, 
1); STH_DX xvec9, 1*SIZE(C1, ldc, 1); #ifndef TRMMKERNEL -LDL_DX 2*SIZE(C0), xvec0; -LDH_DX 3*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec2; -LDH_DX 3*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C1), xvec4; -LDH_DX 3*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1, ldc, 1), xvec6; -LDH_DX 3*SIZE(C1, ldc, 1), xvec6; -ADD_DX xvec0, xvec3; -ADD_DX xvec2, xvec1; -ADD_DX xvec4, xvec7; -ADD_DX xvec6, xvec5; +LDL_DX 2*SIZE(C0), xvec0, xvec0; +LDH_DX 3*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 3*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec4, xvec4; +LDH_DX 3*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_DX 3*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX xvec0, xvec3, xvec3; +ADD_DX xvec2, xvec1, xvec1; +ADD_DX xvec4, xvec7, xvec7; +ADD_DX xvec6, xvec5, xvec5; #endif STL_DX xvec3, 2*SIZE(C0); STH_DX xvec3, 3*SIZE(C0); @@ -1128,72 +1108,72 @@ ALIGN_5; ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### LD_DX 8*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 10*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 4*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; ##### Unroll time 3 #### LD_DX 12*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 14*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $16*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 4 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; DECQ k; JG .L10_bodyB; ALIGN_5 @@ -1210,39 +1190,39 @@ ALIGN_5 ##### Unroll time 1 #### LD_DX 4*SIZE(ptrbb), xvec6; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 
6*SIZE(ptrbb), xvec7; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec6, xvec4; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; #### Unroll time 2 #### LD_DX 0*SIZE(ptrbb), xvec2; SHUF_DX $0x4e, xvec7, xvec5; -MUL_DX xvec1, xvec6; -ADD_DX xvec6, xvec15; +MUL_DX xvec1, xvec6, xvec6; +ADD_DX xvec6, xvec15, xvec15; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec7; -ADD_DX xvec7, xvec11; +MUL_DX xvec1, xvec7, xvec7; +ADD_DX xvec7, xvec11, xvec11; LD_DX 0*SIZE(ptrba), xvec0; -MUL_DX xvec1, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec1, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; SHUF_DX $0x4e, xvec2, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L11_loopE:; #ifndef TRMMKERNEL @@ -1255,35 +1235,35 @@ JLE .L12_loopE; ALIGN_5 .L12_bodyB:; SHUF_DX $0x4e, xvec3, xvec5; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; -MUL_DX xvec0, xvec3; -ADD_DX xvec3, xvec11; +MUL_DX xvec0, xvec3, xvec3; +ADD_DX xvec3, xvec11, xvec11; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec13; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec13, xvec13; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec9; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec9, xvec9; .L12_loopE:; #### Load Alpha #### BROAD_DX MEMALPHA, xvec7; #### Multiply Alpha #### -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec9; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec9, xvec9; #### Reverse the Results #### MOV_DX xvec15, xvec6; -REVS_DX xvec13, xvec15; -REVS_DX xvec6, xvec13; +REVS_DX xvec13, xvec15, xvec15; +REVS_DX xvec6, xvec13, xvec13; MOV_DX xvec11, xvec6; -REVS_DX xvec9, xvec11; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec11, xvec11; +REVS_DX xvec6, xvec9, xvec9; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1292,10 +1272,10 @@ JNE .L12_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec13; -ADD_DX 0*SIZE(C0, ldc, 1), xvec15; -ADD_DX 0*SIZE(C1), xvec9; -ADD_DX 0*SIZE(C1, ldc, 1), xvec11; +ADD_DX 0*SIZE(C0), xvec13, xvec13; +ADD_DX 0*SIZE(C0, ldc, 1), xvec15, xvec15; +ADD_DX 0*SIZE(C1), xvec9, xvec9; +ADD_DX 0*SIZE(C1, ldc, 1), xvec11, xvec11; #endif ST_DX xvec13, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1317,18 +1297,18 @@ JMP .L9_loopE; ALIGN_5 .L12_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -LDL_DX 0*SIZE(C0, ldc, 1), xvec12; -LDH_DX 1*SIZE(C0, ldc, 1), xvec12; -LDL_DX 0*SIZE(C1), xvec10; -LDH_DX 1*SIZE(C1), xvec10; -LDL_DX 0*SIZE(C1, ldc, 1), xvec8; -LDH_DX 1*SIZE(C1, ldc, 1), xvec8; -ADD_DX xvec14, xvec13; -ADD_DX xvec12, xvec15; -ADD_DX xvec10, xvec9; -ADD_DX xvec8, xvec11; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +LDL_DX 0*SIZE(C0, ldc, 1), xvec12, xvec12; +LDH_DX 1*SIZE(C0, ldc, 1), xvec12, xvec12; +LDL_DX 0*SIZE(C1), xvec10, xvec10; +LDH_DX 1*SIZE(C1), xvec10, xvec10; +LDL_DX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_DX 1*SIZE(C1, ldc, 1), xvec8, xvec8; +ADD_DX xvec14, xvec13, xvec13; +ADD_DX xvec12, xvec15, xvec15; +ADD_DX xvec10, xvec9, xvec9; +ADD_DX xvec8, xvec11, 
xvec11; #endif STL_DX xvec13, 0*SIZE(C0); STH_DX xvec13, 1*SIZE(C0); @@ -1455,12 +1435,12 @@ MUL_DY yvec15, yvec7, yvec15; #### Writing Back #### EXTRA_DY $1, yvec15, xvec7; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C0, ldc, 1), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 0*SIZE(C1, ldc, 1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C0, ldc, 1), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 0*SIZE(C1, ldc, 1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C0, ldc, 1); @@ -1549,151 +1529,151 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 3 #### LD_DX 16*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 18*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 20*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 22*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, 
xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 4 #### LD_DX 24*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 26*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 28*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 30*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $32*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; DECQ k; JG .L211_bodyB; ALIGN_5 @@ -1712,77 +1692,77 @@ ALIGN_5; LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; #### Unroll time 2 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 10*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 12*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 14*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, 
xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L212_loopE: #ifndef TRMMKERNEL @@ -1798,65 +1778,65 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; LD_DX 4*SIZE(ptrba), xvec2; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec13; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec13, xvec13; LD_DX 6*SIZE(ptrba), xvec3; SHUF_DX $0x4e, xvec7, xvec4; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec12; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec12, xvec12; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; MOV_DX xvec5, xvec6; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; MOV_DX xvec6, xvec7; -MUL_DX xvec2, xvec6; -ADD_DX xvec6, xvec9; +MUL_DX xvec2, xvec6, xvec6; +ADD_DX xvec6, xvec9, xvec9; -MUL_DX xvec3, xvec7; -ADD_DX xvec7, xvec8; +MUL_DX xvec3, xvec7, xvec7; +ADD_DX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec13; -MUL_DX xvec7, xvec12; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; -MUL_DX xvec7, xvec9; -MUL_DX xvec7, xvec8; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec13, xvec13; +MUL_DX xvec7, xvec12, xvec12; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; +MUL_DX xvec7, xvec9, xvec9; +MUL_DX xvec7, xvec8, xvec8; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; MOV_DX xvec13, xvec6; -REVS_DX xvec9, xvec13; -REVS_DX xvec6, xvec9; +REVS_DX xvec9, xvec13, xvec13; +REVS_DX xvec6, xvec9, xvec9; MOV_DX xvec12, xvec6; -REVS_DX xvec8, xvec12; -REVS_DX xvec6, xvec8; +REVS_DX xvec8, xvec12, xvec12; +REVS_DX xvec6, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1865,14 +1845,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 4*SIZE(C0), xvec9; -ADD_DX 6*SIZE(C0), xvec8; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; -ADD_DX 4*SIZE(C1), xvec13; -ADD_DX 6*SIZE(C1), xvec12; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 4*SIZE(C0), xvec9, xvec9; +ADD_DX 6*SIZE(C0), xvec8, xvec8; +ADD_DX 0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; +ADD_DX 4*SIZE(C1), xvec13, xvec13; +ADD_DX 6*SIZE(C1), xvec12, xvec12; #endif 
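
The "#### Testing Alignment ####" check above ORs the C pointer with the leading dimension and branches to .L213_loopEx whenever either value is not a multiple of 16; the full-width ST_DX stores that follow are only safe when every column of C stays 16-byte aligned. A minimal C sketch of that predicate, assuming illustrative names c0 and ldc_bytes that are not in the patch:

    #include <stdint.h>

    /* Aligned write-back is usable only if both the column pointer and
       the byte stride between columns are 16-byte multiples, so that
       every column of C is 16-byte aligned. */
    static int use_aligned_path(const double *c0, int64_t ldc_bytes)
    {
        return (((uintptr_t)c0 | (uintptr_t)ldc_bytes) & 15) == 0;
    }
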
ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -1900,18 +1880,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; -ADD_DX xvec2, xvec9; -ADD_DX xvec3, xvec8; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; +ADD_DX xvec2, xvec9, xvec9; +ADD_DX xvec3, xvec8, xvec8; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); @@ -1922,18 +1902,18 @@ STH_DX xvec9, 5*SIZE(C0); STL_DX xvec8, 6*SIZE(C0); STH_DX xvec8, 7*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -LDL_DX 4*SIZE(C1), xvec6; -LDH_DX 5*SIZE(C1), xvec6; -LDL_DX 6*SIZE(C1), xvec7; -LDH_DX 7*SIZE(C1), xvec7; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; -ADD_DX xvec6, xvec13; -ADD_DX xvec7, xvec12; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +LDL_DX 4*SIZE(C1), xvec6, xvec6; +LDH_DX 5*SIZE(C1), xvec6, xvec6; +LDL_DX 6*SIZE(C1), xvec7, xvec7; +LDH_DX 7*SIZE(C1), xvec7, xvec7; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; +ADD_DX xvec6, xvec13, xvec13; +ADD_DX xvec7, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2000,79 +1980,79 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 3 #### LD_DX 8*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 10*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; 
+ADD_DX xvec5, xvec10, xvec10; #### Unroll time 4 #### LD_DX 12*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrbb; LD_DX 14*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $16*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; DECQ k; JG .L221_bodyB; ALIGN_5 @@ -2090,40 +2070,40 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; #### Unroll time 2 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; LD_DX 6*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $8*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L222_loopE: #ifndef TRMMKERNEL @@ -2139,37 +2119,37 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrbb; LD_DX 2*SIZE(ptrba), xvec1; SHUF_DX $0x4e, xvec5, xvec4; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec14; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; MOV_DX xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec11; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec11, xvec11; -MUL_DX xvec1, xvec5; -ADD_DX xvec5, xvec10; +MUL_DX xvec1, xvec5, xvec5; +ADD_DX xvec5, xvec10, xvec10; .L223_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec14; -MUL_DX xvec7, xvec11; -MUL_DX xvec7, xvec10; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec14, xvec14; +MUL_DX xvec7, xvec11, xvec11; +MUL_DX xvec7, xvec10, xvec10; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; MOV_DX xvec14, xvec6; -REVS_DX xvec10, xvec14; -REVS_DX xvec6, xvec10; +REVS_DX xvec10, xvec14, xvec14; +REVS_DX xvec6, xvec10, xvec10; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2178,10 +2158,10 @@ JNE .L223_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 2*SIZE(C0), xvec10; -ADD_DX 0*SIZE(C1), xvec15; -ADD_DX 2*SIZE(C1), xvec14; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 2*SIZE(C0), xvec10, xvec10; +ADD_DX 
0*SIZE(C1), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec14, xvec14; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec10, 2*SIZE(C0); @@ -2203,24 +2183,24 @@ JMP .L22_loopE; ALIGN_5 .L223_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec11; -ADD_DX xvec1, xvec10; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec11, xvec11; +ADD_DX xvec1, xvec10, xvec10; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); STL_DX xvec10, 2*SIZE(C0); STH_DX xvec10, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -LDL_DX 2*SIZE(C1), xvec5; -LDH_DX 3*SIZE(C1), xvec5; -ADD_DX xvec4, xvec15; -ADD_DX xvec5, xvec14; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +LDL_DX 2*SIZE(C1), xvec5, xvec5; +LDH_DX 3*SIZE(C1), xvec5, xvec5; +ADD_DX xvec4, xvec15, xvec15; +ADD_DX xvec5, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2278,38 +2258,38 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 3 #### LD_DX 4*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 4 #### LD_DX 6*SIZE(ptrba), xvec0; LD_DX 6*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $8*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; DECQ k; JG .L231_bodyB; @@ -2328,20 +2308,20 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; #### Unroll time 2 #### LD_DX 2*SIZE(ptrba), xvec0; LD_DX 2*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $4*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $4*SIZE, ptrbb; .L232_loopE: #ifndef TRMMKERNEL @@ -2357,21 +2337,21 @@ ALIGN_5 LD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec4; SHUF_DX $0x4e, xvec4, xvec5; -MUL_DX xvec0, xvec4; -ADD_DX xvec4, xvec15; +MUL_DX xvec0, xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; ADDQ $2*SIZE, ptrba; -MUL_DX xvec0, xvec5; -ADD_DX xvec5, xvec11; +MUL_DX xvec0, xvec5, xvec5; +ADD_DX xvec5, xvec11, xvec11; ADDQ $2*SIZE, ptrbb; 
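
Every hunk in this kernel applies the same mechanical rewrite: the destructive two-operand SSE spelling (MUL_DX a, b meaning b *= a) gains an explicit destination (MUL_DX a, b, b) so the macros can expand to three-operand VEX instructions such as vmulpd and vaddpd. A hedged intrinsics sketch of one multiply-accumulate step (the function name is illustrative, not from the patch; compile with -mavx to get the VEX encodings):

    #include <immintrin.h>

    /* One MUL_DX/ADD_DX pair from the loops above: under AVX this
       compiles to the three-operand vmulpd/vaddpd the macros emit. */
    static __m128d mul_add_step(__m128d a, __m128d b, __m128d acc)
    {
        b   = _mm_mul_pd(a, b);    /* MUL_DX a, b, b     */
        acc = _mm_add_pd(b, acc);  /* ADD_DX b, acc, acc */
        return acc;
    }
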
.L233_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; -MUL_DX xvec7, xvec11; +MUL_DX xvec7, xvec15, xvec15; +MUL_DX xvec7, xvec11, xvec11; #### Reverse ##### MOV_DX xvec15, xvec6; -REVS_DX xvec11, xvec15; -REVS_DX xvec6, xvec11; +REVS_DX xvec11, xvec15, xvec15; +REVS_DX xvec6, xvec11, xvec11; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -2380,8 +2360,8 @@ JNE .L233_loopEx; ALIGN_5 #### Writing Back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec11; -ADD_DX 0*SIZE(C1), xvec15; +ADD_DX 0*SIZE(C0), xvec11, xvec11; +ADD_DX 0*SIZE(C1), xvec15, xvec15; #endif ST_DX xvec11, 0*SIZE(C0); ST_DX xvec15, 0*SIZE(C1); @@ -2401,16 +2381,16 @@ JMP .L23_loopE; ALIGN_5 .L233_loopEx:; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -ADD_DX xvec0, xvec11; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +ADD_DX xvec0, xvec11, xvec11; #endif STL_DX xvec11, 0*SIZE(C0); STH_DX xvec11, 1*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec4; -LDH_DX 1*SIZE(C1), xvec4; -ADD_DX xvec4, xvec15; +LDL_DX 0*SIZE(C1), xvec4, xvec4; +LDH_DX 1*SIZE(C1), xvec4, xvec4; +ADD_DX xvec4, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C1); STH_DX xvec15, 1*SIZE(C1); @@ -2462,23 +2442,23 @@ ALIGN_5 .L241_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; BROAD_DX 2*SIZE(ptrba), xvec0; LD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 3*SIZE(ptrba), xvec1; LD_DX 6*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2496,13 +2476,13 @@ ALIGN_5 .L242_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; BROAD_DX 1*SIZE(ptrba), xvec1; LD_DX 2*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L242_loopE: @@ -2517,18 +2497,18 @@ ALIGN_5 .L243_bodyB: BROAD_DX 0*SIZE(ptrba), xvec0; LD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L243_loopE: BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 0*SIZE(C1), xvec0; -ADD_DX xvec0, xvec15; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 0*SIZE(C1), xvec0, xvec0; +ADD_DX xvec0, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 0*SIZE(C1); @@ -2705,10 +2685,10 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec13; -ADD_DX 4*SIZE(C0), xvec14; -ADD_DX 6*SIZE(C0), xvec12; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec13, xvec13; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C0), xvec12, xvec12; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec13, 2*SIZE(C0); @@ -2733,18 +2713,18 @@ ALIGN_5 EXTRA_DY $1, yvec15, xvec13; EXTRA_DY $1, yvec14, xvec12; #ifndef TRMMKERNEL 
-LDL_DX 0*SIZE(C0), xvec11; -LDH_DX 1*SIZE(C0), xvec11; -LDL_DX 2*SIZE(C0), xvec10; -LDH_DX 3*SIZE(C0), xvec10; -LDL_DX 4*SIZE(C0), xvec9; -LDH_DX 5*SIZE(C0), xvec9; -LDL_DX 6*SIZE(C0), xvec8; -LDH_DX 7*SIZE(C0), xvec8; -ADD_DX xvec11, xvec15; -ADD_DX xvec10, xvec13; -ADD_DX xvec9, xvec14; -ADD_DX xvec8, xvec12; +LDL_DX 0*SIZE(C0), xvec11, xvec11; +LDH_DX 1*SIZE(C0), xvec11, xvec11; +LDL_DX 2*SIZE(C0), xvec10, xvec10; +LDH_DX 3*SIZE(C0), xvec10, xvec10; +LDL_DX 4*SIZE(C0), xvec9, xvec9; +LDH_DX 5*SIZE(C0), xvec9, xvec9; +LDL_DX 6*SIZE(C0), xvec8, xvec8; +LDH_DX 7*SIZE(C0), xvec8, xvec8; +ADD_DX xvec11, xvec15, xvec15; +ADD_DX xvec10, xvec13, xvec13; +ADD_DX xvec9, xvec14, xvec14; +ADD_DX xvec8, xvec12, xvec12; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2878,8 +2858,8 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0), xvec14; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0), xvec14, xvec14; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec14, 2*SIZE(C0); @@ -2900,12 +2880,12 @@ ALIGN_5 #### Writing Back #### EXTRA_DY $1, yvec15, xvec14; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec13; -LDH_DX 1*SIZE(C0), xvec13; -LDL_DX 2*SIZE(C0), xvec12; -LDH_DX 3*SIZE(C0), xvec12; -ADD_DX xvec13, xvec15; -ADD_DX xvec12, xvec14; +LDL_DX 0*SIZE(C0), xvec13, xvec13; +LDH_DX 1*SIZE(C0), xvec13, xvec13; +LDL_DX 2*SIZE(C0), xvec12, xvec12; +LDH_DX 3*SIZE(C0), xvec12, xvec12; +ADD_DX xvec13, xvec15, xvec15; +ADD_DX xvec12, xvec14, xvec14; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2959,23 +2939,23 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec4; BROAD_DX 2*SIZE(ptrbb), xvec5; -MUL_DX xvec4, xvec5; -ADD_DX xvec5, xvec15; +MUL_DX xvec4, xvec5, xvec5; +ADD_DX xvec5, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec6; BROAD_DX 3*SIZE(ptrbb), xvec7; -MUL_DX xvec6, xvec7; -ADD_DX xvec7, xvec15; +MUL_DX xvec6, xvec7, xvec7; +ADD_DX xvec7, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -2993,13 +2973,13 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADD_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADD_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L332_loopE: @@ -3014,18 +2994,18 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD_DX xvec2, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L333_loopE: #### Multiply Alpha #### BROAD_DX MEMALPHA, xvec7; -MUL_DX xvec7, xvec15; +MUL_DX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec14; -LDH_DX 1*SIZE(C0), xvec14; -ADD_DX xvec14, xvec15; +LDL_DX 0*SIZE(C0), xvec14, xvec14; +LDH_DX 1*SIZE(C0), xvec14, xvec14; +ADD_DX xvec14, xvec15, xvec15; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3074,25 +3054,25 @@ SARQ $2, k; JLE .L341_loopE; ALIGN_5 .L341_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd 
xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 2*SIZE(ptrba), xvec0; -movsd 2*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 2*SIZE(ptrba), xvec0; +vmovsd 2*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 3*SIZE(ptrba), xvec0; -movsd 3*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 3*SIZE(ptrba), xvec0; +vmovsd 3*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3108,15 +3088,15 @@ TEST $2, %rax; JLE .L342_loopE; ALIGN_5 .L342_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; -movsd 1*SIZE(ptrba), xvec0; -movsd 1*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 1*SIZE(ptrba), xvec0; +vmovsd 1*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3130,20 +3110,20 @@ TEST $1, %rax; JLE .L343_loopE; ALIGN_5 .L343_bodyB: -movsd 0*SIZE(ptrba), xvec0; -movsd 0*SIZE(ptrbb), xvec1; -mulsd xvec0, xvec1; -addsd xvec1, xvec15; +vmovsd 0*SIZE(ptrba), xvec0; +vmovsd 0*SIZE(ptrbb), xvec1; +vmulsd xvec0, xvec1, xvec1; +vaddsd xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L343_loopE: #### Writing Back #### -movsd MEMALPHA, xvec7; -mulsd xvec7, xvec15; +vmovsd MEMALPHA, xvec7; +vmulsd xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -movsd 0*SIZE(C0), xvec0; -addsd xvec0, xvec15; +vmovsd 0*SIZE(C0), xvec0; +vaddsd xvec0, xvec15, xvec15; #endif movsd xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) @@ -3170,6 +3150,9 @@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi diff --git a/kernel/x86_64/sgemm_kernel_8x8_sandy.S b/kernel/x86_64/sgemm_kernel_8x8_sandy.S index 59458effe..20ddcaa8e 100644 --- a/kernel/x86_64/sgemm_kernel_8x8_sandy.S +++ b/kernel/x86_64/sgemm_kernel_8x8_sandy.S @@ -146,75 +146,53 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
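
Note the vzeroupper added to the dgemm epilogue above: Sandy Bridge pays a state-transition stall when code with dirty upper ymm halves hands control to legacy-SSE encodings, and the kernel's final movsd store (and any SSE code in the caller) is exactly that. The intrinsics equivalent is a one-liner; a sketch, assuming an AVX-enabled build:

    #include <immintrin.h>

    /* Clear the upper 128 bits of all ymm registers before legacy
       SSE code runs, avoiding the AVX<->SSE transition penalty. */
    static void avx_epilogue(void)
    {
        _mm256_zeroupper();   /* emits vzeroupper */
    }

The macro table below makes the matching change for the sgemm kernel: the generic opcode names now resolve to VEX mnemonics, so the kernel body itself stays encoding-agnostic.
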
#define MOVQ movq #define XOR_SY vxorps -#define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_SX vxorps #define LD_SY vmovaps -#define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd -#define LDL_SX movlps +#define LD_SX vmovaps +#define LDL_SX vmovlps #define LDL_SY vmovlps -#define LDH_SX movhps +#define LDH_SX vmovhps #define LDH_SY vmovhps #define ST_SY vmovaps -#define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd -#define STL_SX movlps +#define ST_SX vmovaps +#define STL_SX vmovlps #define STL_SY vmovlps -#define STH_SX movhps +#define STH_SX vmovhps #define STH_SY vmovhps #define EDUP_SY vmovsldup #define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup -#define EDUP_DY vmovddup +#define EDUP_SX vmovsldup +#define ODUP_SX vmovshdup #define ADD_SY vaddps -#define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_SX vaddps #define ADD1_DY vaddpd -#define ADD2_DY vaddpd -#define ADDSUB_DY vaddsubpd #define ADDSUB_SY vaddsubps #define MUL_SY vmulps -#define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_SX vmulps #define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_SX vpshufd #define VPERMILP_SY vpermilps #define VPERMILP_SX vpermilps -#define VPERMILP_DY vpermilpd #define BROAD_SY vbroadcastss -#define BROAD_DY vbroadcastsd #define BROAD_SX vbroadcastss -#define BROAD_DX movddup #define MOV_SY vmovaps -#define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_SX vmovaps #define REVS_SY vshufps -#define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_SX vshufps #define EXTRA_SY vextractf128 -#define EXTRA_DY vextractf128 PROLOGUE @@ -260,6 +238,8 @@ movq old_offset, %r11 #endif #endif +vzeroupper + vmovlps ALPHA, MEMALPHA movq old_bm, bm movq old_bn, bn @@ -864,125 +844,125 @@ ALIGN_4 #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; 
-ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 3 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 24*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; ADDQ $16*SIZE, ptrba; #### Unroll time 4 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 28*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $32*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; DECQ k; JG .L8_bodyB; ALIGN_4 @@ -997,65 +977,65 @@ ALIGN_4 .L9_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX 
xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 8*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; #### Unroll time 2 #### ADDQ $8*SIZE, ptrba; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 12*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrbb; LD_SX 0*SIZE(ptrba), xvec0; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; EDUP_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec9; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L9_loopE: #ifndef TRMMKERNEL @@ -1068,57 +1048,57 @@ ALIGN_4 .L10_bodyB: #### Unroll time 1 #### SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; EDUP_SX 4*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrbb; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec11; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec11, xvec11; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec10; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec10, xvec10; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec9; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec8; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec9, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec8, xvec8; .L10_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Reverse Result #### MOV_SX xvec15, xvec7; -REVS_SX $0xe4, xvec13, xvec15; -REVS_SX $0xe4, xvec7, xvec13; +REVS_SX $0xe4, xvec13, xvec15, xvec15; +REVS_SX $0xe4, 
xvec7, xvec13, xvec13; MOV_SX xvec14, xvec7; -REVS_SX $0xe4, xvec12, xvec14; -REVS_SX $0xe4, xvec7, xvec12; +REVS_SX $0xe4, xvec12, xvec14, xvec14; +REVS_SX $0xe4, xvec7, xvec12, xvec12; MOV_SX xvec11, xvec7; -REVS_SX $0xe4, xvec9, xvec11; -REVS_SX $0xe4, xvec7, xvec9; +REVS_SX $0xe4, xvec9, xvec11, xvec11; +REVS_SX $0xe4, xvec7, xvec9, xvec9; MOV_SX xvec10, xvec7; -REVS_SX $0xe4, xvec8, xvec10; -REVS_SX $0xe4, xvec7, xvec8; +REVS_SX $0xe4, xvec8, xvec10, xvec10; +REVS_SX $0xe4, xvec7, xvec8, xvec8; #### Testing Alignment #### MOVQ C0, %rax; OR ldc, %rax; @@ -1127,14 +1107,14 @@ JNE .L10_loopEx; ALIGN_4 LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -ADD_SX 0*SIZE(C0), xvec15; -ADD_SX 0*SIZE(C0, ldc, 1), xvec14; -ADD_SX 0*SIZE(C0, ldc, 2), xvec13; -ADD_SX 0*SIZE(C0, %rax, 1), xvec12; -ADD_SX 0*SIZE(C1), xvec11; -ADD_SX 0*SIZE(C1, ldc, 1), xvec10; -ADD_SX 0*SIZE(C1, ldc, 2), xvec9; -ADD_SX 0*SIZE(C1, %rax, 1), xvec8; +ADD_SX 0*SIZE(C0), xvec15, xvec15; +ADD_SX 0*SIZE(C0, ldc,1), xvec14, xvec14; +ADD_SX 0*SIZE(C0, ldc,2), xvec13, xvec13; +ADD_SX 0*SIZE(C0, %rax,1), xvec12, xvec12; +ADD_SX 0*SIZE(C1), xvec11, xvec11; +ADD_SX 0*SIZE(C1, ldc,1), xvec10, xvec10; +ADD_SX 0*SIZE(C1, ldc,2), xvec9, xvec9; +ADD_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; #endif ST_SX xvec15, 0*SIZE(C0); ST_SX xvec14, 0*SIZE(C0, ldc, 1); @@ -1161,30 +1141,30 @@ ALIGN_4 .L10_loopEx: LEAQ (ldc,ldc,2),%rax; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec7; -LDH_SX 2*SIZE(C0), xvec7; -LDL_SX 0*SIZE(C0, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 0*SIZE(C0, ldc, 2), xvec5; -LDH_SX 2*SIZE(C0, ldc, 2), xvec5; -LDL_SX 0*SIZE(C0, %rax, 1), xvec4; -LDH_SX 2*SIZE(C0, %rax, 1), xvec4; -LDL_SX 0*SIZE(C1), xvec3; -LDH_SX 2*SIZE(C1), xvec3; -LDL_SX 0*SIZE(C1, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 0*SIZE(C1, ldc, 2), xvec1; -LDH_SX 2*SIZE(C1, ldc, 2), xvec1; -LDL_SX 0*SIZE(C1, %rax, 1), xvec0; -LDH_SX 2*SIZE(C1, %rax, 1), xvec0; -ADD_SX xvec7, xvec15; -ADD_SX xvec6, xvec14; -ADD_SX xvec5, xvec13; -ADD_SX xvec4, xvec12; -ADD_SX xvec3, xvec11; -ADD_SX xvec2, xvec10; -ADD_SX xvec1, xvec9; -ADD_SX xvec0, xvec8; +LDL_SX 0*SIZE(C0), xvec7, xvec7; +LDH_SX 2*SIZE(C0), xvec7, xvec7; +LDL_SX 0*SIZE(C0, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 0*SIZE(C0, ldc, 2), xvec5, xvec5; +LDH_SX 2*SIZE(C0, ldc, 2), xvec5, xvec5; +LDL_SX 0*SIZE(C0, %rax, 1), xvec4, xvec4; +LDH_SX 2*SIZE(C0, %rax, 1), xvec4, xvec4; +LDL_SX 0*SIZE(C1), xvec3, xvec3; +LDH_SX 2*SIZE(C1), xvec3, xvec3; +LDL_SX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 2), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 2), xvec1, xvec1; +LDL_SX 0*SIZE(C1, %rax, 1), xvec0, xvec0; +LDH_SX 2*SIZE(C1, %rax, 1), xvec0, xvec0; +ADD_SX xvec7, xvec15, xvec15; +ADD_SX xvec6, xvec14, xvec14; +ADD_SX xvec5, xvec13, xvec13; +ADD_SX xvec4, xvec12, xvec12; +ADD_SX xvec3, xvec11, xvec11; +ADD_SX xvec2, xvec10, xvec10; +ADD_SX xvec1, xvec9, xvec9; +ADD_SX xvec0, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -1258,63 +1238,63 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, 
xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0x44, xvec0, xvec1; EDUP_SX 16*SIZE(ptrbb), xvec2; ODUP_SX 16*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 20*SIZE(ptrbb), xvec4; ODUP_SX 20*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 24*SIZE(ptrbb), xvec2; ODUP_SX 24*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 28*SIZE(ptrbb), xvec4; ODUP_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1334,32 +1314,32 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; SHUF_SX $0xee, xvec0, xvec6; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; -MUL_SX xvec6, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec6, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec6, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec6, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 12*SIZE(ptrbb), xvec4; ODUP_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec6, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec6, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec6, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec6, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1376,40 +1356,40 @@ LD_SX 0*SIZE(ptrba), xvec0; # a1, a2, a3, a4 SHUF_SX $0x44, xvec0, xvec1; # a1, a2, a1, a2 EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; 
+MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; EDUP_SX 4*SIZE(ptrbb), xvec4; ODUP_SX 4*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L13_loopE: LEAQ (ldc,ldc,2),%rax; #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec11; -LDH_SX 0*SIZE(C0, ldc, 2), xvec11; -LDL_SX 0*SIZE(C0, ldc, 1), xvec10; -LDH_SX 0*SIZE(C0, %rax, 1), xvec10; -LDL_SX 0*SIZE(C1), xvec9; -LDH_SX 0*SIZE(C1, ldc, 2), xvec9; -LDL_SX 0*SIZE(C1, ldc, 1), xvec8; -LDH_SX 0*SIZE(C1, %rax, 1), xvec8; -ADD_SX xvec11, xvec15; -ADD_SX xvec10, xvec14; -ADD_SX xvec9, xvec13; -ADD_SX xvec8, xvec12; +LDL_SX 0*SIZE(C0), xvec11, xvec11; +LDH_SX 0*SIZE(C0, ldc, 2), xvec11, xvec11; +LDL_SX 0*SIZE(C0, ldc, 1), xvec10, xvec10; +LDH_SX 0*SIZE(C0, %rax, 1), xvec10, xvec10; +LDL_SX 0*SIZE(C1), xvec9, xvec9; +LDH_SX 0*SIZE(C1, ldc, 2), xvec9, xvec9; +LDL_SX 0*SIZE(C1, ldc, 1), xvec8, xvec8; +LDH_SX 0*SIZE(C1, %rax,1), xvec8, xvec8; +ADD_SX xvec11, xvec15, xvec15; +ADD_SX xvec10, xvec14, xvec14; +ADD_SX xvec9, xvec13, xvec13; +ADD_SX xvec8, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 0*SIZE(C0, ldc, 2); @@ -1471,35 +1451,35 @@ ALIGN_4 .L14_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; BROAD_SX 2*SIZE(ptrba), xvec0; LD_SX 16*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 20*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 3*SIZE(ptrba), xvec1; LD_SX 24*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 28*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $32*SIZE, ptrbb; @@ -1517,19 +1497,19 @@ ALIGN_4 .L15_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; BROAD_SX 1*SIZE(ptrba), xvec1; LD_SX 8*SIZE(ptrbb), xvec4; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; LD_SX 12*SIZE(ptrbb), xvec5; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -1544,18 
+1524,18 @@ ALIGN_4 .L16_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $1, ptrba; ADDQ $4, ptrbb; .L16_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; LEAQ (ldc,ldc,2),%rax; SHUF_SX $0xff, xvec15, xvec13; @@ -1676,96 +1656,96 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; #### Unroll time 2 #### ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 8*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 16*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 20*SIZE(ptrba), xvec1; #### Unroll time 3 #### ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 12*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; ADDQ $16*SIZE, ptrbb; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX 
xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 24*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 28*SIZE(ptrba), xvec1; ADDQ $32*SIZE, ptrba; @@ -1773,32 +1753,32 @@ ADDQ $32*SIZE, ptrba; ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; DECQ k; JG .L211_bodyB; @@ -1816,33 +1796,33 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 4*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; ADDQ $8*SIZE, ptrbb; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 8*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 12*SIZE(ptrba), xvec1; ADDQ $16*SIZE, ptrba; @@ -1850,31 +1830,31 @@ ADDQ $16*SIZE, ptrba; ODUP_SX -4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; SHUF_SX $0x4e, xvec3, xvec5; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; EDUP_SX 0*SIZE(ptrbb), xvec2; MOV_SX xvec4, xvec6; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX 
xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; LD_SX 0*SIZE(ptrba), xvec0; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; LD_SX 4*SIZE(ptrba), xvec1; .L212_loopE: @@ -1889,70 +1869,70 @@ ALIGN_4 ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; MOV_SX xvec2, xvec6; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; ADDQ $4*SIZE, ptrbb; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; MOV_SX xvec3, xvec7; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec13; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec13, xvec13; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec12; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec12, xvec12; MOV_SX xvec4, xvec6; ADDQ $8*SIZE, ptrba; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec11; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec10; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec11, xvec11; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec10, xvec10; MOV_SX xvec5, xvec7; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec9; -MUL_SX xvec1, xvec7; -ADD_SX xvec7, xvec8; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec9, xvec9; +MUL_SX xvec1, xvec7, xvec7; +ADD_SX xvec7, xvec8, xvec8; .L213_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; -MUL_SX xvec7, xvec11; -MUL_SX xvec7, xvec10; -MUL_SX xvec7, xvec9; -MUL_SX xvec7, xvec8; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; +MUL_SX xvec7, xvec11, xvec11; +MUL_SX xvec7, xvec10, xvec10; +MUL_SX xvec7, xvec9, xvec9; +MUL_SX xvec7, xvec8, xvec8; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C0, ldc, 1), xvec2; -LDH_SX 2*SIZE(C1, ldc, 1), xvec2; -LDL_SX 4*SIZE(C0, ldc, 1), xvec3; -LDH_SX 6*SIZE(C1, ldc, 1), xvec3; -LDL_SX 0*SIZE(C1), xvec4; -LDH_SX 2*SIZE(C0), xvec4; -LDL_SX 4*SIZE(C1), xvec5; -LDH_SX 6*SIZE(C0), xvec5; -LDL_SX 0*SIZE(C1, ldc, 1), xvec6; -LDH_SX 2*SIZE(C0, ldc, 1), xvec6; -LDL_SX 4*SIZE(C1, ldc, 1), xvec7; -LDH_SX 6*SIZE(C0, ldc, 1), xvec7; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; -ADD_SX xvec4, xvec11; -ADD_SX xvec5, xvec10; -ADD_SX xvec6, xvec9; -ADD_SX xvec7, xvec8; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), xvec1, xvec1; +LDL_SX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_SX 2*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_SX 4*SIZE(C0, ldc, 1), xvec3, xvec3; +LDH_SX 6*SIZE(C1, ldc, 1), xvec3, xvec3; +LDL_SX 0*SIZE(C1), xvec4, xvec4; +LDH_SX 2*SIZE(C0), xvec4, xvec4; +LDL_SX 4*SIZE(C1), xvec5, xvec5; +LDH_SX 6*SIZE(C0), xvec5, xvec5; +LDL_SX 0*SIZE(C1, ldc, 1), xvec6, xvec6; +LDH_SX 2*SIZE(C0, ldc, 1), xvec6, xvec6; +LDL_SX 4*SIZE(C1, ldc, 1), xvec7, xvec7; +LDH_SX 6*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; +ADD_SX xvec4, xvec11, xvec11; +ADD_SX xvec5, xvec10, xvec10; +ADD_SX xvec6, xvec9, xvec9; +ADD_SX xvec7, xvec8, xvec8; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ 
-2028,64 +2008,64 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 8*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec2; ODUP_SX 8*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 12*SIZE(ptrba), xvec1; EDUP_SX 12*SIZE(ptrbb), xvec2; ODUP_SX 12*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15 +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15 SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2106,32 +2086,32 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; LD_SX 4*SIZE(ptrba), xvec1; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec1, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec1, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec1, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec1, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec13 -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13 +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; .L222_loopE: @@ -2148,39 +2128,39 @@ EDUP_SX 0*SIZE(ptrbb), xvec2; ODUP_SX 
0*SIZE(ptrbb), xvec3; SHUF_SX $0x4e, xvec2, xvec4; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; SHUF_SX $0x4e, xvec3, xvec5; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec13; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec12; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec13, xvec13; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec12, xvec12; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L223_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C0, ldc, 1), xvec1; -LDH_SX 2*SIZE(C1, ldc, 1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 0*SIZE(C1, ldc, 1), xvec3; -LDH_SX 2*SIZE(C0, ldc, 1), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_SX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_SX 2*SIZE(C0, ldc, 1), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2242,35 +2222,35 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 8*SIZE(ptrbb), xvec4; ODUP_SX 8*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 12*SIZE(ptrbb), xvec6; ODUP_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; @@ -2290,18 +2270,18 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; SHUF_SX $0xee, xvec0, xvec2; EDUP_SX 4*SIZE(ptrbb), xvec6; ODUP_SX 4*SIZE(ptrbb), xvec7; -MUL_SX xvec2, xvec6; -ADD_SX xvec6, xvec15; -MUL_SX xvec2, xvec7; -ADD_SX xvec7, xvec14; +MUL_SX 
xvec2, xvec6, xvec6; +ADD_SX xvec6, xvec15, xvec15; +MUL_SX xvec2, xvec7, xvec7; +ADD_SX xvec7, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2318,10 +2298,10 @@ LD_SX 0*SIZE(ptrba), xvec0; EDUP_SX 0*SIZE(ptrbb), xvec4; ODUP_SX 0*SIZE(ptrbb), xvec5; SHUF_SX $0x44, xvec0, xvec1; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec15; -MUL_SX xvec1, xvec5; -ADD_SX xvec5, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec15, xvec15; +MUL_SX xvec1, xvec5, xvec5; +ADD_SX xvec5, xvec14, xvec14; ADDQ $2*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2392,23 +2372,23 @@ ALIGN_4 .L241_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; BROAD_SX 2*SIZE(ptrba), xvec4; LD_SX 8*SIZE(ptrbb), xvec5; -MUL_SX xvec4, xvec5; -ADD_SX xvec5, xvec15; +MUL_SX xvec4, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; BROAD_SX 3*SIZE(ptrba), xvec6; LD_SX 12*SIZE(ptrbb), xvec7; -MUL_SX xvec6, xvec7; -ADD_SX xvec7, xvec15; +MUL_SX xvec6, xvec7, xvec7; +ADD_SX xvec7, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $16*SIZE, ptrbb; DECQ k; @@ -2425,13 +2405,13 @@ ALIGN_4 .L242_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; BROAD_SX 1*SIZE(ptrba), xvec2; LD_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec2, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec2, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2446,14 +2426,14 @@ ALIGN_4; .L243_bodyB: BROAD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $1*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; .L243_loopE: #### Multiply Alpha #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; SHUF_SX $0xff, xvec15, xvec14; SHUF_SX $0xaa, xvec15, xvec13; SHUF_SX $0x55, xvec15, xvec12; @@ -2546,34 +2526,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; @@ -2581,34 +2561,34 @@ LD_SX 16*SIZE(ptrba), xvec0; LD_SX 20*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, 
xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 24*SIZE(ptrba), xvec0; LD_SX 28*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $32*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2630,34 +2610,34 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; SHUF_SX $0xfa, xvec2, xvec3; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 12*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2676,40 +2656,40 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrba), xvec1; MOV_SX xvec3, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; SHUF_SX $0x4e, xvec4, xvec5; -MUL_SX xvec1, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec1, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; MOV_SX xvec5, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec13; -MUL_SX xvec1, xvec6; -ADD_SX xvec6, xvec12; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec13, xvec13; +MUL_SX xvec1, xvec6, xvec6; +ADD_SX xvec6, xvec12, xvec12; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L313_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; -MUL_SX xvec7, xvec13; -MUL_SX xvec7, xvec12; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; +MUL_SX xvec7, xvec13, xvec13; +MUL_SX xvec7, xvec12, xvec12; #### Writing Back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 4*SIZE(C0), xvec1; -LDH_SX 6*SIZE(C1), xvec1; -LDL_SX 0*SIZE(C1), xvec2; -LDH_SX 2*SIZE(C0), xvec2; -LDL_SX 4*SIZE(C1), xvec3; -LDH_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; -ADD_SX xvec2, xvec13; -ADD_SX xvec3, xvec12; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 4*SIZE(C0), xvec1, xvec1; +LDH_SX 6*SIZE(C1), 
xvec1, xvec1; +LDL_SX 0*SIZE(C1), xvec2, xvec2; +LDH_SX 2*SIZE(C0), xvec2, xvec2; +LDL_SX 4*SIZE(C1), xvec3, xvec3; +LDH_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; +ADD_SX xvec2, xvec13, xvec13; +ADD_SX xvec3, xvec12, xvec12; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2774,35 +2754,35 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; LD_SX 8*SIZE(ptrba), xvec0; LD_SX 4*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 12*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $16*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; @@ -2822,18 +2802,18 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; SHUF_SX $0xfa, xvec2, xvec5; SHUF_SX $0xaf, xvec2, xvec6; -MUL_SX xvec0, xvec5; -ADD_SX xvec5, xvec15; -MUL_SX xvec0, xvec6; -ADD_SX xvec6, xvec14; +MUL_SX xvec0, xvec5, xvec5; +ADD_SX xvec5, xvec15, xvec15; +MUL_SX xvec0, xvec6, xvec6; +ADD_SX xvec6, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2850,25 +2830,25 @@ LD_SX 0*SIZE(ptrba), xvec0; LD_SX 0*SIZE(ptrbb), xvec2; SHUF_SX $0x50, xvec2, xvec3; SHUF_SX $0x05, xvec2, xvec4; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec15; -MUL_SX xvec0, xvec4; -ADD_SX xvec4, xvec14; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec15, xvec15; +MUL_SX xvec0, xvec4, xvec4; +ADD_SX xvec4, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L323_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; #### Writing back #### #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C1), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -LDH_SX 2*SIZE(C0), xvec1; -ADD_SX xvec0, xvec15; -ADD_SX xvec1, xvec14; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C1), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +LDH_SX 2*SIZE(C0), xvec1, xvec1; +ADD_SX xvec0, xvec15, xvec15; +ADD_SX xvec1, xvec14, xvec14; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C1); @@ -2928,19 +2908,19 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, 
xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; LD_SX 4*SIZE(ptrba), xvec0; EDUP_SX 4*SIZE(ptrbb), xvec2; ODUP_SX 4*SIZE(ptrbb), xvec3; -MUL_SX xvec0, xvec2; -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -2959,10 +2939,10 @@ LD_SX 0*SIZE(ptrba), xvec0; # a0, a1, a2, a3 EDUP_SX 0*SIZE(ptrbb), xvec2; # b0, b0, b2, b2 ODUP_SX 0*SIZE(ptrbb), xvec3; # b1, b1, b3, b3 -MUL_SX xvec0, xvec2; # c00, c10 -ADD_SX xvec2, xvec15; -MUL_SX xvec0, xvec3; # C01, c11 -ADD_SX xvec3, xvec14; +MUL_SX xvec0, xvec2, xvec2; +ADD_SX xvec2, xvec15, xvec15; +MUL_SX xvec0, xvec3, xvec3; +ADD_SX xvec3, xvec14, xvec14; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -2985,7 +2965,7 @@ mulss xvec0, xvec2; addss xvec2, xvec15; mulss xvec1, xvec3; SHUF_SX $0xe1, xvec3, xvec4; -ADD_SX xvec4, xvec15; +ADD_SX xvec4, xvec15, xvec15; movss 1*SIZE(ptrbb), xvec5; XOR_SY yvec6, yvec6, yvec6; @@ -2994,26 +2974,26 @@ mulss xvec0, xvec5; addss xvec5, xvec14; mulss xvec1, xvec6; SHUF_SX $0xe1, xvec6, xvec7; -ADD_SX xvec7, xvec14 +ADD_SX xvec7, xvec14, xvec14 ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; .L333_loopE: BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; -MUL_SX xvec7, xvec14; +MUL_SX xvec7, xvec15, xvec15; +MUL_SX xvec7, xvec14, xvec14; SHUF_SX $0xee, xvec15, xvec13; SHUF_SX $0xee, xvec14, xvec12; SHUF_SX $0x44, xvec15, xvec11; SHUF_SX $0x44, xvec14, xvec10; -ADD_SX xvec13, xvec11; -ADD_SX xvec12, xvec10; +ADD_SX xvec13, xvec11, xvec11; +ADD_SX xvec12, xvec10, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 0*SIZE(C1), xvec1; -ADD_SX xvec0, xvec11; -ADD_SX xvec1, xvec10; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 0*SIZE(C1), xvec1, xvec1; +ADD_SX xvec0, xvec11, xvec11; +ADD_SX xvec1, xvec10, xvec10; #endif STL_SX xvec11, 0*SIZE(C0); STL_SX xvec10, 0*SIZE(C1); @@ -3305,14 +3285,14 @@ SHUF_SX $0xee, xvec15, xvec12; SHUF_SX $0x44, xvec14, xvec11; SHUF_SX $0xee, xvec14, xvec10; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDL_SX 2*SIZE(C0), xvec1; -LDL_SX 4*SIZE(C0), xvec2; -LDL_SX 6*SIZE(C0), xvec3; -ADD_SX xvec0, xvec13; -ADD_SX xvec1, xvec12; -ADD_SX xvec2, xvec11; -ADD_SX xvec3, xvec10; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDL_SX 2*SIZE(C0), xvec1, xvec1; +LDL_SX 4*SIZE(C0), xvec2, xvec2; +LDL_SX 6*SIZE(C0), xvec3, xvec3; +ADD_SX xvec0, xvec13, xvec13; +ADD_SX xvec1, xvec12, xvec12; +ADD_SX xvec2, xvec11, xvec11; +ADD_SX xvec3, xvec10, xvec10; #endif STL_SX xvec13, 0*SIZE(C0); STL_SX xvec12, 2*SIZE(C0); @@ -3368,23 +3348,23 @@ ALIGN_4 .L421_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 8*SIZE(ptrba), xvec0; BROAD_SX 2*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 12*SIZE(ptrba), xvec0; BROAD_SX 3*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $16*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; DECQ k; @@ -3401,13 +3381,13 @@ ALIGN_4 .L422_bodyB: LD_SX 0*SIZE(ptrba), xvec0; 
BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; LD_SX 4*SIZE(ptrba), xvec0; BROAD_SX 1*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -3422,19 +3402,19 @@ ALIGN_4 .L423_bodyB: LD_SX 0*SIZE(ptrba), xvec0; BROAD_SX 0*SIZE(ptrbb), xvec1; -MUL_SX xvec0, xvec1; -ADD_SX xvec1, xvec15; +MUL_SX xvec0, xvec1, xvec1; +ADD_SX xvec1, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $1*SIZE, ptrbb; .L423_loopE: #### Writing back #### BROAD_SX MEMALPHA, xvec7; -MUL_SX xvec7, xvec15; +MUL_SX xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -LDL_SX 0*SIZE(C0), xvec0; -LDH_SX 2*SIZE(C0), xvec0; -ADD_SX xvec0, xvec15; +LDL_SX 0*SIZE(C0), xvec0, xvec0; +LDH_SX 2*SIZE(C0), xvec0, xvec0; +ADD_SX xvec0, xvec15, xvec15; #endif STL_SX xvec15, 0*SIZE(C0); STH_SX xvec15, 2*SIZE(C0); @@ -3485,37 +3465,37 @@ SARQ $2, k; JLE .L431_loopE; ALIGN_4 .L431_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; -movss 4*SIZE(ptrba), xvec0; -movss 5*SIZE(ptrba), xvec1; -movss 2*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 4*SIZE(ptrba), xvec0; +vmovss 5*SIZE(ptrba), xvec1; +vmovss 2*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 6*SIZE(ptrba), xvec3; -movss 7*SIZE(ptrba), xvec4; -movss 3*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 6*SIZE(ptrba), xvec3; +vmovss 7*SIZE(ptrba), xvec4; +vmovss 3*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $8*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3530,21 +3510,21 @@ TEST $2, kkk; JLE .L432_loopE; ALIGN_4 .L432_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; -movss 2*SIZE(ptrba), xvec3; -movss 3*SIZE(ptrba), xvec4; -movss 1*SIZE(ptrbb), xvec5; -mulss xvec5, xvec3; -addss xvec3, xvec15; -mulss xvec5, xvec4; -addss xvec4, xvec14; +vmovss 2*SIZE(ptrba), xvec3; +vmovss 3*SIZE(ptrba), xvec4; +vmovss 1*SIZE(ptrbb), xvec5; +vmulss xvec5, xvec3, xvec3; +vaddss xvec3, xvec15, xvec15; +vmulss xvec5, xvec4, xvec4; +vaddss xvec4, xvec14, xvec14; addq $4*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3557,28 +3537,28 @@ TEST $1, kkk; JLE 
.L433_loopE; ALIGN_4 .L433_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrba), xvec1; -movss 0*SIZE(ptrbb), xvec2; -mulss xvec2, xvec0; -addss xvec0, xvec15; -mulss xvec2, xvec1; -addss xvec1, xvec14; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrba), xvec1; +vmovss 0*SIZE(ptrbb), xvec2; +vmulss xvec2, xvec0, xvec0; +vaddss xvec0, xvec15, xvec15; +vmulss xvec2, xvec1, xvec1; +vaddss xvec1, xvec14, xvec14; addq $2*SIZE, ptrba; addq $1*SIZE, ptrbb; .L433_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; -mulss xvec7, xvec14; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; +vmulss xvec7, xvec14, xvec14; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; -addss 1*SIZE(C0), xvec14; +vaddss 0*SIZE(C0), xvec15, xvec15; +vaddss 1*SIZE(C0), xvec14, xvec14; #endif -movss xvec15, 0*SIZE(C0); -movss xvec14, 1*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); +vmovss xvec14, 1*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3625,25 +3605,25 @@ SARQ $2, k; JLE .L441_loopE; ALIGN_4 .L441_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 2*SIZE(ptrba), xvec0; -movss 2*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 2*SIZE(ptrba), xvec0; +vmovss 2*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 3*SIZE(ptrba), xvec0; -movss 3*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 3*SIZE(ptrba), xvec0; +vmovss 3*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $4*SIZE, ptrba; addq $4*SIZE, ptrbb; decq k; @@ -3658,15 +3638,15 @@ TEST $2, kkk; JLE .L442_loopE; ALIGN_4 .L442_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; -movss 1*SIZE(ptrba), xvec0; -movss 1*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 1*SIZE(ptrba), xvec0; +vmovss 1*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $2*SIZE, ptrba; addq $2*SIZE, ptrbb; @@ -3679,21 +3659,21 @@ TEST $1, kkk; JLE .L443_loopE; ALIGN_4 .L443_bodyB: -movss 0*SIZE(ptrba), xvec0; -movss 0*SIZE(ptrbb), xvec1; -mulss xvec0, xvec1; -addss xvec1, xvec15; +vmovss 0*SIZE(ptrba), xvec0; +vmovss 0*SIZE(ptrbb), xvec1; +vmulss xvec0, xvec1, xvec1; +vaddss xvec1, xvec15, xvec15; addq $1*SIZE, ptrba; addq $1*SIZE, ptrbb; .L443_loopE: #### Writing Back #### -movss MEMALPHA, xvec7; -mulss xvec7, xvec15; +vmovss MEMALPHA, xvec7; +vmulss xvec7, xvec15, xvec15; #ifndef TRMMKERNEL -addss 0*SIZE(C0), xvec15; +vaddss 0*SIZE(C0), xvec15, xvec15; #endif -movss xvec15, 0*SIZE(C0); +vmovss xvec15, 0*SIZE(C0); #if (defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)) ||(defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA)) MOVQ bk, %rax; SUBQ kkk, %rax; @@ -3711,6 +3691,7 @@ MOV bk, k; SALQ $2, k; ADDQ k, bb; ADDQ ldc, C; + .L40_loopE: movq 0(%rsp), %rbx; movq 8(%rsp), %rbp; @@ -3718,6 +3699,9 
@@ movq 16(%rsp), %r12; movq 24(%rsp), %r13; movq 32(%rsp), %r14; movq 40(%rsp), %r15; + +vzeroupper + #ifdef WINDOWS_ABI movq 48(%rsp), %rdi movq 56(%rsp), %rsi @@ -3732,6 +3716,7 @@ movq 40(%rsp), %r15; movups 192(%rsp), %xmm14 movups 208(%rsp), %xmm15 #endif + addq $STACKSIZE, %rsp; ret diff --git a/kernel/x86_64/zgemm_kernel_4x4_sandy.S b/kernel/x86_64/zgemm_kernel_4x4_sandy.S index f6f9f707f..9f6fb8a5f 100644 --- a/kernel/x86_64/zgemm_kernel_4x4_sandy.S +++ b/kernel/x86_64/zgemm_kernel_4x4_sandy.S @@ -148,74 +148,49 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #undef MOVQ #define MOVQ movq -#define XOR_SY vxorps #define XOR_DY vxorpd -#define XOR_SX xorps -#define XOR_DX xorpd +#define XOR_DX vxorpd -#define LD_SY vmovaps #define LD_DY vmovapd -#define LD_SX movaps -#define LD_DX movapd +#define LD_DX vmovapd #define LDL_DY vmovlpd -#define LDL_DX movlpd +#define LDL_DX vmovlpd #define LDH_DY vmovhpd -#define LDH_DX movhpd +#define LDH_DX vmovhpd -#define ST_SY vmovaps #define ST_DY vmovapd -#define ST_SX movaps -#define ST_DX movapd +#define ST_DX vmovapd #define STL_DY vmovlpd -#define STL_DX movlpd +#define STL_DX vmovlpd #define STH_DY vmovhpd -#define STH_DX movhpd +#define STH_DX vmovhpd -#define EDUP_SY vmovsldup -#define ODUP_SY vmovshdup -#define EDUP_SX movsldup -#define ODUP_SX movshdup #define EDUP_DY vmovddup -#define ADD_SY vaddps #define ADD_DY vaddpd -#define ADD_SX addps -#define ADD_DX addpd +#define ADD_DX vaddpd #define SUB_DY vsubpd -#define SUB_DX subpd +#define SUB_DX vsubpd #define ADDSUB_DY vaddsubpd -#define ADDSUB_DX addsubpd -#define ADDSUB_SY vaddsubps +#define ADDSUB_DX vaddsubpd -#define MUL_SY vmulps #define MUL_DY vmulpd -#define MUL_SX mulps -#define MUL_DX mulpd +#define MUL_DX vmulpd -#define SHUF_SY vperm2f128 #define SHUF_DY vperm2f128 -#define SHUF_DX pshufd -#define SHUF_SX pshufd +#define SHUF_DX vpshufd -#define VPERMILP_SY vpermilps -#define VPERMILP_SX vpermilps #define VPERMILP_DY vpermilpd -#define BROAD_SY vbroadcastss #define BROAD_DY vbroadcastsd -#define BROAD_SX vbroadcastss -#define BROAD_DX movddup +#define BROAD_DX vmovddup -#define MOV_SY vmovaps #define MOV_DY vmovapd -#define MOV_SX movaps -#define MOV_DX movapd +#define MOV_DX vmovapd -#define REVS_SY vshufps #define REVS_DY vshufpd -#define REVS_SX shufps -#define REVS_DX movsd +#define REVS_DX vmovsd #define EXTRA_DY vextractf128 @@ -282,6 +257,8 @@ movq old_offset, %r11; #endif #endif +vzeroupper + vmovlps %xmm0, MEMALPHA_R vmovlps %xmm1, MEMALPHA_I movq old_bm, bm @@ -1373,14 +1350,14 @@ EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $1, yvec12, xvec4; #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0), xvec15; -ADD_DX 2*SIZE(C0, ldc, 1), xvec7; -ADD_DX 0*SIZE(C0, ldc, 1), xvec13; -ADD_DX 2*SIZE(C0), xvec5; -ADD_DX 0*SIZE(C1), xvec14; -ADD_DX 2*SIZE(C1, ldc, 1), xvec6; -ADD_DX 0*SIZE(C1, ldc, 1), xvec12; -ADD_DX 2*SIZE(C1), xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C0, ldc, 1), xvec7, xvec7; +ADD_DX 0*SIZE(C0, ldc, 1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 0*SIZE(C1), xvec14, xvec14; +ADD_DX 2*SIZE(C1, ldc, 1), xvec6, xvec6; +ADD_DX 0*SIZE(C1, ldc, 1), xvec12, xvec12; +ADD_DX 2*SIZE(C1), xvec4, xvec4; #endif ST_DX xvec15, 0*SIZE(C0); ST_DX xvec7, 2*SIZE(C0, ldc, 1); @@ -1410,18 +1387,18 @@ EXTRA_DY $1, yvec14, xvec6; EXTRA_DY $1, yvec13, xvec5; EXTRA_DY $2, yvec12, xvec4; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0, ldc, 1), xvec1; -LDH_DX 3*SIZE(C0, ldc, 
1), xvec1; -LDL_DX 0*SIZE(C0, ldc, 1), xvec2; -LDH_DX 1*SIZE(C0, ldc, 1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C0, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C0, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -1432,18 +1409,18 @@ STH_DX xvec13, 1*SIZE(C0, ldc, 1); STL_DX xvec6, 2*SIZE(C0); STH_DX xvec6, 3*SIZE(C0); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec0; -LDH_DX 1*SIZE(C1), xvec0; -LDL_DX 2*SIZE(C1, ldc, 1), xvec1; -LDH_DX 3*SIZE(C1, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1, ldc, 1), xvec2; -LDH_DX 1*SIZE(C1, ldc, 1), xvec2; -LDL_DX 2*SIZE(C1), xvec3; -LDH_DX 3*SIZE(C1), xvec3; -ADD_DX xvec0, xvec14; -ADD_DX xvec1, xvec6; -ADD_DX xvec2, xvec12; -ADD_DX xvec3, xvec4; +LDL_DX 0*SIZE(C1), xvec0, xvec0; +LDH_DX 1*SIZE(C1), xvec0, xvec0; +LDL_DX 2*SIZE(C1, ldc, 1), xvec1, xvec1; +LDH_DX 3*SIZE(C1, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1, ldc, 1), xvec2, xvec2; +LDH_DX 1*SIZE(C1, ldc, 1), xvec2, xvec2; +LDL_DX 2*SIZE(C1), xvec3, xvec3; +LDH_DX 3*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec14, xvec14; +ADD_DX xvec1, xvec6, xvec6; +ADD_DX xvec2, xvec12, xvec12; +ADD_DX xvec3, xvec4, xvec4; #endif STL_DX xvec14, 0*SIZE(C1); STH_DX xvec14, 1*SIZE(C1); @@ -1680,18 +1657,18 @@ ADD2_DY yvec4, yvec14, yvec14; EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C0, ldc, 1), xvec1; -LDH_DX 1*SIZE(C0, ldc, 1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 0*SIZE(C1, ldc, 1), xvec3; -LDH_DX 1*SIZE(C1, ldc, 1), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C0, ldc, 1), xvec1, xvec1; +LDH_DX 1*SIZE(C0, ldc, 1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 0*SIZE(C1, ldc, 1), xvec3, xvec3; +LDH_DX 1*SIZE(C1, ldc, 1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2063,14 +2040,14 @@ JNE .L213_loopEx; ALIGN_5 #### Writing back #### #ifndef TRMMKERNEL -ADD_DX 0*SIZE(C0),xvec15; -ADD_DX 2*SIZE(C1),xvec7; -ADD_DX 4*SIZE(C0),xvec14; -ADD_DX 6*SIZE(C1),xvec6; -ADD_DX 0*SIZE(C1),xvec13; -ADD_DX 2*SIZE(C0),xvec5; -ADD_DX 4*SIZE(C1),xvec12; -ADD_DX 6*SIZE(C0),xvec4; +ADD_DX 0*SIZE(C0), xvec15, xvec15; +ADD_DX 2*SIZE(C1), xvec7, xvec7; +ADD_DX 4*SIZE(C0), xvec14, xvec14; +ADD_DX 6*SIZE(C1), xvec6, xvec6; +ADD_DX 0*SIZE(C1), xvec13, xvec13; +ADD_DX 2*SIZE(C0), xvec5, xvec5; +ADD_DX 4*SIZE(C1), xvec12, xvec12; +ADD_DX 6*SIZE(C0), xvec4, xvec4; #endif ST_DX xvec15,0*SIZE(C0); ST_DX xvec7,2*SIZE(C1); @@ -2098,18 +2075,18 @@ JMP .L21_loopE; ALIGN_5 .L213_loopEx: #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C1), xvec3; -LDH_DX 7*SIZE(C1), xvec3; 
-ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C1), xvec3, xvec3; +LDH_DX 7*SIZE(C1), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2120,18 +2097,18 @@ STH_DX xvec14, 5*SIZE(C0); STL_DX xvec6, 6*SIZE(C1); STH_DX xvec6, 7*SIZE(C1); #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C1), xvec3; -LDH_DX 1*SIZE(C1), xvec3; -LDL_DX 2*SIZE(C0), xvec2; -LDH_DX 3*SIZE(C0), xvec2; -LDL_DX 4*SIZE(C1), xvec1; -LDH_DX 5*SIZE(C1), xvec1; -LDL_DX 6*SIZE(C0), xvec0; -LDH_DX 7*SIZE(C0), xvec0; -ADD_DX xvec3, xvec13; -ADD_DX xvec2, xvec5; -ADD_DX xvec1, xvec12; -ADD_DX xvec0, xvec4; +LDL_DX 0*SIZE(C1), xvec3, xvec3; +LDH_DX 1*SIZE(C1), xvec3, xvec3; +LDL_DX 2*SIZE(C0), xvec2, xvec2; +LDH_DX 3*SIZE(C0), xvec2, xvec2; +LDL_DX 4*SIZE(C1), xvec1, xvec1; +LDH_DX 5*SIZE(C1), xvec1, xvec1; +LDL_DX 6*SIZE(C0), xvec0, xvec0; +LDH_DX 7*SIZE(C0), xvec0, xvec0; +ADD_DX xvec3, xvec13, xvec13; +ADD_DX xvec2, xvec5, xvec5; +ADD_DX xvec1, xvec12, xvec12; +ADD_DX xvec0, xvec4, xvec4; #endif STL_DX xvec13, 0*SIZE(C1); STH_DX xvec13, 1*SIZE(C1); @@ -2384,18 +2361,18 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec13, xvec5; #### Write back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C1), xvec1; -LDH_DX 3*SIZE(C1), xvec1; -LDL_DX 0*SIZE(C1), xvec2; -LDH_DX 1*SIZE(C1), xvec2; -LDL_DX 2*SIZE(C0), xvec3; -LDH_DX 3*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec13; -ADD_DX xvec3, xvec5; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C1), xvec1, xvec1; +LDH_DX 3*SIZE(C1), xvec1, xvec1; +LDL_DX 0*SIZE(C1), xvec2, xvec2; +LDH_DX 1*SIZE(C1), xvec2, xvec2; +LDL_DX 2*SIZE(C0), xvec3, xvec3; +LDH_DX 3*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec13, xvec13; +ADD_DX xvec3, xvec5, xvec5; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2582,12 +2559,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 0*SIZE(C1), xvec1; -LDH_DX 1*SIZE(C1), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 0*SIZE(C1), xvec1, xvec1; +LDH_DX 1*SIZE(C1), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -2845,18 +2822,18 @@ EXTRA_DY $1, yvec15, xvec7; EXTRA_DY $1, yvec14, xvec6; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -LDL_DX 4*SIZE(C0), xvec2; -LDH_DX 5*SIZE(C0), xvec2; -LDL_DX 6*SIZE(C0), xvec3; -LDH_DX 7*SIZE(C0), xvec3; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; -ADD_DX xvec2, xvec14; -ADD_DX xvec3, xvec6; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +LDL_DX 4*SIZE(C0), xvec2, xvec2; +LDH_DX 5*SIZE(C0), xvec2, xvec2; +LDL_DX 6*SIZE(C0), xvec3, xvec3; +LDH_DX 7*SIZE(C0), xvec3, xvec3; +ADD_DX xvec0, 
xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; +ADD_DX xvec2, xvec14, xvec14; +ADD_DX xvec3, xvec6, xvec6; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3026,12 +3003,12 @@ ADD2_DY yvec5, yvec15, yvec15; EXTRA_DY $1, yvec15, xvec7; #### Writing Back #### #ifndef TRMMKERNEL -LDL_DX 0*SIZE(C0), xvec0; -LDH_DX 1*SIZE(C0), xvec0; -LDL_DX 2*SIZE(C0), xvec1; -LDH_DX 3*SIZE(C0), xvec1; -ADD_DX xvec0, xvec15; -ADD_DX xvec1, xvec7; +LDL_DX 0*SIZE(C0), xvec0, xvec0; +LDH_DX 1*SIZE(C0), xvec0, xvec0; +LDL_DX 2*SIZE(C0), xvec1, xvec1; +LDH_DX 3*SIZE(C0), xvec1, xvec1; +ADD_DX xvec0, xvec15, xvec15; +ADD_DX xvec1, xvec7, xvec7; #endif STL_DX xvec15, 0*SIZE(C0); STH_DX xvec15, 1*SIZE(C0); @@ -3084,43 +3061,43 @@ ALIGN_5 .L331_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 4*SIZE(ptrba), xvec0; BROAD_DX 4*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 5*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 6*SIZE(ptrba), xvec0; BROAD_DX 6*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 7*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $8*SIZE, ptrba; ADDQ $8*SIZE, ptrbb; DECQ k; @@ -3137,23 +3114,23 @@ ALIGN_5 .L332_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; LD_DX 2*SIZE(ptrba), xvec0; BROAD_DX 2*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 3*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $4*SIZE, ptrba; ADDQ $4*SIZE, ptrbb; @@ -3168,13 +3145,13 @@ ALIGN_5 .L333_bodyB: LD_DX 0*SIZE(ptrba), xvec0; BROAD_DX 0*SIZE(ptrbb), xvec2; -MUL_DX xvec0, xvec2; -ADD1_DX xvec2, xvec15; +MUL_DX xvec0, xvec2, xvec2; +ADD1_DX xvec2, xvec15, xvec15; SHUF_DX $0x4e, xvec0, xvec1; BROAD_DX 1*SIZE(ptrbb), xvec3; -MUL_DX xvec1, xvec3; -ADDSUB_DX xvec3, xvec15; +MUL_DX xvec1, xvec3, xvec3; +ADDSUB_DX xvec3, xvec15, xvec15; ADDQ $2*SIZE, ptrba; ADDQ $2*SIZE, ptrbb; @@ -3182,14 +3159,14 @@ ADDQ $2*SIZE, ptrbb; #### Handle #### XOR_DY yvec7, yvec7, yvec7; #if defined(RN) || defined(RT) || defined(CN) || defined(CT) -ADDSUB_DX xvec15, xvec7; +ADDSUB_DX xvec15, xvec7, xvec7; MOV_DX xvec7, 
xvec15;
#elif defined(NR) || defined(NC) || defined(TR) || defined(TC)
-SUB_DX xvec15, xvec7;
+SUB_DX xvec15, xvec7, xvec7;
MOV_DX xvec7, xvec15;
#elif defined(RR) || defined(RC) || defined(CR) || defined(CC)
SHUF_DX $0x4e, xvec15, xvec15;
-ADDSUB_DX xvec15, xvec7;
+ADDSUB_DX xvec15, xvec7, xvec7;
MOV_DX xvec7, xvec15;
SHUF_DX $0x4e, xvec15, xvec15;
#endif
@@ -3199,14 +3176,14 @@ BROAD_DX MEMALPHA_R,xvec7;
BROAD_DX MEMALPHA_I,xvec6;
#### Multiply Alpha ####
SHUF_DX $0x4e, xvec15, xvec5;
-MUL_DX xvec7, xvec15;
-MUL_DX xvec6, xvec5;
-ADDSUB_DX xvec5, xvec15;
+MUL_DX xvec7, xvec15, xvec15;
+MUL_DX xvec6, xvec5, xvec5;
+ADDSUB_DX xvec5, xvec15, xvec15;
#### Writing back ####
#ifndef TRMMKERNEL
-LDL_DX 0*SIZE(C0), xvec0;
-LDH_DX 1*SIZE(C0), xvec0;
-ADD_DX xvec0, xvec15;
+LDL_DX 0*SIZE(C0), xvec0, xvec0;
+LDH_DX 1*SIZE(C0), xvec0, xvec0;
+ADD_DX xvec0, xvec15, xvec15;
#endif
STL_DX xvec15, 0*SIZE(C0);
STH_DX xvec15, 1*SIZE(C0);
@@ -3237,6 +3214,9 @@ movq 24(%rsp), %r13;
movq 32(%rsp), %r14;
movq 40(%rsp), %r15;
+
+vzeroupper
+
#ifdef WINDOWS_ABI
movq 48(%rsp), %rdi
movq 56(%rsp), %rsi

From 13f5f181406df3de4553d2481206df1b19a99b4a Mon Sep 17 00:00:00 2001
From: Zhang Xianyi
Date: Tue, 26 Jun 2012 07:43:06 +0800
Subject: [PATCH 46/46] Updated the doc for the 0.2.0 version.

---
 Changelog.txt | 15 +++++++++++++++
 README.md     |  6 +++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/Changelog.txt b/Changelog.txt
index 0ed35b0e4..c222c7eee 100644
--- a/Changelog.txt
+++ b/Changelog.txt
@@ -1,4 +1,17 @@ OpenBLAS ChangeLog
+====================================================================
+Version 0.2.0
+26-Jun-2012
+common:
+	* Removed the 64-core limit on the number of CPU cores.
+	  Up to 256 cores are now supported.
+	* Supported the clang compiler.
+	* Fixed some build bugs on FreeBSD.
+x86/x86-64:
+	* Optimized Level-3 BLAS on Intel Sandy Bridge x86-64 with AVX instructions.
+	  Please use gcc >= 4.6 or clang >= 3.1.
+	* Supported AMD Bobcat by using the GotoBLAS2 AMD Barcelona codes.
+
====================================================================
Version 0.1.1
29-Apr-2012
@@ -7,6 +20,8 @@ common:
	* Supported LAPACKE, a C interface to LAPACK. (Thanks to Zaheer Chothia)
	* Fixed the build bug (MD5 and download) on Mac OSX.
	* Auto download CUnit 2.1.2-2 from SF.net with UTEST_CHECK=1.
+	* Fixed the compatibility issue for compilers without C99 complex number
+	  support (e.g. Visual Studio)
x86/x86_64:
	* Auto-detect Intel Sandy Bridge Core i7-3xxx & Xeon E7 Westmere-EX.
	* Test alpha=NaN in dscal.
diff --git a/README.md b/README.md
index a13e069ec..82e9f528c 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version. OpenB

Please read the documents on the OpenBLAS wiki pages.

-## Intallation
+## Installation

Download from the project homepage. http://xianyi.github.com/OpenBLAS/
Or, check out the code from git://github.com/xianyi/OpenBLAS.git
@@ -76,9 +76,9 @@
The priorities are OPENBLAS_NUM_THREADS > GOTO_NUM_THREADS > OMP_NUM_THREADS.

If you compile this lib with USE_OPENMP=1, you should set the OMP_NUM_THREADS environment variable. OpenBLAS ignores OPENBLAS_NUM_THREADS and GOTO_NUM_THREADS with USE_OPENMP=1.

-### Set the number of threads with calling functions.
+### Set the number of threads at runtime.

-Examples:
+We provide the functions below to control the number of threads at runtime. Changing the number of threads at runtime is not yet supported on Windows; there, these functions are no-ops.

 void goto_set_num_threads(int num_threads);
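As a usage sketch for the runtime interface above: a program linked against libopenblas can pick the thread count at startup using only the goto_set_num_threads prototype shown in the README. This is a minimal illustration, not part of the patch; the choice of 4 threads and the printf are illustrative assumptions, and no particular public header is assumed to export the prototype.

    #include <stdio.h>

    /* Prototype as shown in the README above; declared by hand because
       this sketch does not assume a header that exports it. */
    void goto_set_num_threads(int num_threads);

    int main(void) {
        goto_set_num_threads(4);  /* illustrative: request 4 BLAS worker threads */
        /* ... BLAS routines called from here on use the requested count;
           on Windows this call is a no-op for now, per the README ... */
        printf("requested 4 OpenBLAS threads\n");
        return 0;
    }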
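For reference, the bulk of the kernel hunks in this series perform one mechanical change: the *_SX/*_DX macros switch from two-operand SSE mnemonics to three-operand VEX-encoded AVX ones, so every call site gains an explicit destination operand. A minimal sketch of the pattern, in the AT&T syntax the kernels use (the concrete registers here are chosen only for illustration; the kernels refer to them through their xvec aliases):

    # SSE form: two operands, the destination doubles as a source.
    mulps  %xmm0, %xmm2            # xmm2 = xmm2 * xmm0
    addps  %xmm2, %xmm15           # xmm15 = xmm15 + xmm2

    # AVX (VEX) form: three operands with non-destructive sources; naming
    # the same register twice reproduces the old SSE semantics, hence
    # rewrites such as MUL_SX xvec0, xvec2 -> MUL_SX xvec0, xvec2, xvec2.
    vmulps %xmm0, %xmm2, %xmm2     # xmm2 = xmm2 * xmm0
    vaddps %xmm2, %xmm15, %xmm15   # xmm15 = xmm15 + xmm2

The vzeroupper instructions added before each ret fit the same migration: they clear the upper 128 bits of the ymm registers so that legacy SSE code executed afterwards does not pay the AVX-to-SSE transition penalty.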