Merge branch 'bulldozer' into develop

commit c0b1e41bec

@@ -324,16 +324,14 @@ ifeq ($(ARCH), x86)
 DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
 	       CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
-#BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
 endif
 endif

 ifeq ($(ARCH), x86_64)
 DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
 ifneq ($(NO_AVX), 1)
-DYNAMIC_CORE += SANDYBRIDGE
-#BULLDOZER PILEDRIVER
+DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
 endif
 endif

cpuid.h

@@ -105,8 +105,8 @@
 #define CORE_NANO	19
 #define CORE_SANDYBRIDGE 20
 #define CORE_BOBCAT     21
-#define CORE_BULLDOZER CORE_BARCELONA
-#define CORE_PILEDRIVER CORE_BARCELONA
+#define CORE_BULLDOZER  22
+#define CORE_PILEDRIVER  23
 #define CORE_HASWELL CORE_SANDYBRIDGE

 #define HAVE_SSE      (1 <<  0)
@@ -198,8 +198,8 @@ typedef struct {
 #define CPUTYPE_NANO			43
 #define CPUTYPE_SANDYBRIDGE             44
 #define CPUTYPE_BOBCAT                  45
-#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
-#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
+#define CPUTYPE_BULLDOZER               46
+#define CPUTYPE_PILEDRIVER              47
 // this define is because BLAS doesn't have haswell specific optimizations yet
 #define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE

@@ -63,16 +63,14 @@ extern gotoblas_t  gotoblas_BARCELONA;
 extern gotoblas_t  gotoblas_BOBCAT;
 #ifndef NO_AVX
 extern gotoblas_t  gotoblas_SANDYBRIDGE;
-//extern gotoblas_t  gotoblas_BULLDOZER;
-//extern gotoblas_t  gotoblas_PILEDRIVER;
+extern gotoblas_t  gotoblas_BULLDOZER;
+extern gotoblas_t  gotoblas_PILEDRIVER;
 #else
 //Use NEHALEM kernels for sandy bridge
 #define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
 #endif

-#define gotoblas_BULLDOZER gotoblas_BARCELONA
-#define gotoblas_PILEDRIVER gotoblas_BARCELONA
-
 #endif
 //Use sandy bridge kernels for haswell.
 #define gotoblas_HASWELL gotoblas_SANDYBRIDGE

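With Bulldozer and Piledriver promoted from Barcelona aliases to kernel tables of their own, the dynamic-arch driver can return a distinct gotoblas_t per detected core. A minimal sketch of the selection idea, assuming a get_coretype()-style probe as in dynamic.c (simplified, hypothetical helper name, not the actual implementation):

    static gotoblas_t *pick_table(int cputype) {
      switch (cputype) {
      case CPUTYPE_BULLDOZER:  return &gotoblas_BULLDOZER;   /* previously Barcelona's table */
      case CPUTYPE_PILEDRIVER: return &gotoblas_PILEDRIVER;  /* previously Barcelona's table */
      case CPUTYPE_BARCELONA:  return &gotoblas_BARCELONA;
      default:                 return NULL;  /* caller falls back to a reference core */
      }
    }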
@@ -494,7 +494,7 @@ static void disable_affinity(void) {

 #ifndef USE_OPENMP
   for(i=0; i< count; i++){
-    lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
+    lprocmask[i] &= common->avail[i];
   }
 #endif

@@ -754,7 +754,7 @@ void gotoblas_affinity_init(void) {
     if (common -> num_nodes > 1) numa_mapping();

     common -> final_num_procs = 0;
-    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
+    for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1;   //Make the max cpu number.

     for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] =  0;

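final_num_procs is now derived from the highest available CPU index rather than from the number of set bits. A minimal sketch of the two helpers, assuming rcount() has the shape OpenBLAS's init code uses (index of the highest set bit); only the contrast matters here:

    static inline int popcount(unsigned long x) {
      int n = 0;
      while (x) { n += x & 1; x >>= 1; }   /* counts usable CPUs in the mask */
      return n;
    }

    static inline int rcount(unsigned long x) {
      int i = -1;
      while (x) { i++; x >>= 1; }          /* zero-based index of the highest set bit */
      return i;
    }

With a sparse affinity mask (say CPUs 0 and 5 only), popcount() gives 2 but CPU 5 is still indexed later, so rcount()+1 = 6 keeps cpu_use[] and the per-CPU loops covering every CPU number that can appear.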
@@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "OPTERON"
 #endif

-#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_PILEDRIVER) || defined (FORCE_BULLDOZER)
+#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"

@@ -384,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "BOBCAT"
 #endif

-#if 0
+#if defined (FORCE_BULLDOZER)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"

@@ -400,7 +400,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "BULLDOZER"
 #endif

-#if 0
+#if defined (FORCE_PILEDRIVER)
 #define FORCE
 #define FORCE_INTEL
 #define ARCHITECTURE    "X86"
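These two blocks were dead code behind "#if 0"; they are now reachable when the build forces the target (OpenBLAS passes -DFORCE_$(TARGET) to getarch, so e.g. make TARGET=BULLDOZER defines FORCE_BULLDOZER). A compressed illustration of the pattern; the real blocks also set cache sizes and naming fields:

    #if defined (FORCE_BULLDOZER)   /* reachable again, previously #if 0 */
    #define FORCE
    #define ARCHITECTURE    "X86"
    #define CORENAME  "BULLDOZER"
    #endif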
@@ -582,6 +582,24 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
| 
 | ||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| ifdef STRMMKERNEL | ||||
| 
 | ||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| 
 | ||||
| else | ||||
| $(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
@@ -595,17 +613,79 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
| $(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_LN | ||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_LT | ||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_RN | ||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_RT | ||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_LN | ||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_LT | ||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_RN | ||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| ifdef DTRMMKERNEL_RT | ||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| else | ||||
| $(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| endif | ||||
| 
 | ||||
| endif | ||||
| 
 | ||||
| ifdef QTRMMKERNEL | ||||
| 
 | ||||
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
@@ -619,6 +699,50 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| $(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@ | ||||
| 
 | ||||
| $(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@ | ||||
| 
 | ||||
| endif | ||||
| 
 | ||||
| ifdef CTRMMKERNEL | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| $(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
@@ -643,6 +767,37 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
| $(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| endif | ||||
| 
 | ||||
| ifdef ZTRMMKERNEL | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| $(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
@@ -666,7 +821,37 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
| 
 | ||||
| $(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| endif | ||||
| endif | ||||
| 
 | ||||
| ifdef XTRMMKERNEL | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| else | ||||
| 
 | ||||
| $(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@ | ||||
@@ -692,6 +877,9 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
| $(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@ | ||||
| 
 | ||||
| endif | ||||
| 
 | ||||
| 
 | ||||
| $(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL) | ||||
| 	$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@ | ||||
| 
 | ||||

File diff suppressed because it is too large

@@ -0,0 +1,750 @@
| #include "common.h" | ||||
| 
 | ||||
| int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)  | ||||
| { | ||||
| 
 | ||||
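/* Summary comment (added; assumes OpenBLAS's usual packed-panel
   convention): plain-C 8x2 TRMM kernel. It computes a bm-by-bn tile of
   C from packed operands: ba holds A in slices of 8 rows (4/2/1 for
   the tails below), bb holds B in slices of 2 columns (1 for the
   tail), bk is the depth of the product, ldc the leading dimension of
   C, and offset the diagonal offset used by the TRMM bookkeeping. */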
|    BLASLONG i,j,k; | ||||
|    FLOAT *C0,*C1,*ptrba,*ptrbb; | ||||
| 
 | ||||
|    FLOAT res0_0; | ||||
|    FLOAT res0_1; | ||||
|    FLOAT res0_2; | ||||
|    FLOAT res0_3; | ||||
|    FLOAT res0_4; | ||||
|    FLOAT res0_5; | ||||
|    FLOAT res0_6; | ||||
|    FLOAT res0_7; | ||||
| 
 | ||||
|    FLOAT res1_0; | ||||
|    FLOAT res1_1; | ||||
|    FLOAT res1_2; | ||||
|    FLOAT res1_3; | ||||
|    FLOAT res1_4; | ||||
|    FLOAT res1_5; | ||||
|    FLOAT res1_6; | ||||
|    FLOAT res1_7; | ||||
| 
 | ||||
|    FLOAT a0; | ||||
|    FLOAT a1; | ||||
| 
 | ||||
|    FLOAT b0; | ||||
|    FLOAT b1; | ||||
| 
 | ||||
|    BLASLONG off, temp; | ||||
| 
 | ||||
| #if !defined(LEFT) | ||||
|    off = -offset;  | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|    for (j=0; j<bn/2; j+=1)  | ||||
|    { | ||||
|         C0 = C; | ||||
|         C1 = C0+ldc; | ||||
| 
 | ||||
| #if defined(TRMMKERNEL) && defined(LEFT) | ||||
| 		off = offset; | ||||
| #endif | ||||
| 
 | ||||
| 
 | ||||
|         ptrba = ba; | ||||
| 
 | ||||
|         for (i=0; i<bm/8; i+=1)  | ||||
|         { | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*8; | ||||
| 		ptrbb = bb + off*2; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 		res0_2 = 0; | ||||
| 		res0_3 = 0; | ||||
| 		res0_4 = 0; | ||||
| 		res0_5 = 0; | ||||
| 		res0_6 = 0; | ||||
| 		res0_7 = 0; | ||||
| 
 | ||||
| 		res1_0 = 0; | ||||
| 		res1_1 = 0; | ||||
| 		res1_2 = 0; | ||||
| 		res1_3 = 0; | ||||
| 		res1_4 = 0; | ||||
| 		res1_5 = 0; | ||||
| 		res1_6 = 0; | ||||
| 		res1_7 = 0; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+8;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+2;	// number of values in B
 | ||||
| #endif | ||||
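		/* Added note on the three cases above: in the full-depth case the
		   k loop runs bk-off iterations; otherwise off tracks the current
		   tile's position relative to the diagonal, and temp trims the loop
		   to the part of the triangular operand that actually contributes:
		   off plus the 8 rows of A handled here, or off plus the 2 columns
		   of B. */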
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 			b1 = ptrbb[1]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 			res1_0 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 			res1_1 += a1*b1; | ||||
| 
 | ||||
| 			a0 = ptrba[2]; | ||||
| 			res0_2 += a0*b0; | ||||
| 			res1_2 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[3]; | ||||
| 			res0_3 += a1*b0; | ||||
| 			res1_3 += a1*b1; | ||||
| 
 | ||||
| 			a0 = ptrba[4]; | ||||
| 			res0_4 += a0*b0; | ||||
| 			res1_4 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[5]; | ||||
| 			res0_5 += a1*b0; | ||||
| 			res1_5 += a1*b1; | ||||
| 
 | ||||
| 			a0 = ptrba[6]; | ||||
| 			res0_6 += a0*b0; | ||||
| 			res1_6 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[7]; | ||||
| 			res0_7 += a1*b0; | ||||
| 			res1_7 += a1*b1; | ||||
| 
 | ||||
| 			ptrba = ptrba+8; | ||||
| 			ptrbb = ptrbb+2; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 		res0_2 *= alpha; | ||||
| 		res0_3 *= alpha; | ||||
| 		res0_4 *= alpha; | ||||
| 		res0_5 *= alpha; | ||||
| 		res0_6 *= alpha; | ||||
| 		res0_7 *= alpha; | ||||
| 
 | ||||
| 		res1_0 *= alpha; | ||||
| 		res1_1 *= alpha; | ||||
| 		res1_2 *= alpha; | ||||
| 		res1_3 *= alpha; | ||||
| 		res1_4 *= alpha; | ||||
| 		res1_5 *= alpha; | ||||
| 		res1_6 *= alpha; | ||||
| 		res1_7 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 		C0[2] = res0_2; | ||||
| 		C0[3] = res0_3; | ||||
| 		C0[4] = res0_4; | ||||
| 		C0[5] = res0_5; | ||||
| 		C0[6] = res0_6; | ||||
| 		C0[7] = res0_7; | ||||
| 
 | ||||
| 		C1[0] = res1_0; | ||||
| 		C1[1] = res1_1; | ||||
| 		C1[2] = res1_2; | ||||
| 		C1[3] = res1_3; | ||||
| 		C1[4] = res1_4; | ||||
| 		C1[5] = res1_5; | ||||
| 		C1[6] = res1_6; | ||||
| 		C1[7] = res1_7; | ||||
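		/* Added note: these stores overwrite C rather than accumulate into
		   it, matching TRMM's B := alpha*op(A)*B usage in which the
		   destination tile is recomputed in full. */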
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 8; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 2; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*8; | ||||
| 		ptrbb += temp*2; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 8; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+8; | ||||
| 		C1 = C1+8; | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 4 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*4; | ||||
| 		ptrbb = bb + off*2; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 		res0_2 = 0; | ||||
| 		res0_3 = 0; | ||||
| 
 | ||||
| 		res1_0 = 0; | ||||
| 		res1_1 = 0; | ||||
| 		res1_2 = 0; | ||||
| 		res1_3 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+4;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+2;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 			b1 = ptrbb[1]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 			res1_0 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 			res1_1 += a1*b1; | ||||
| 
 | ||||
| 			a0 = ptrba[2]; | ||||
| 			res0_2 += a0*b0; | ||||
| 			res1_2 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[3]; | ||||
| 			res0_3 += a1*b0; | ||||
| 			res1_3 += a1*b1; | ||||
| 
 | ||||
| 			ptrba = ptrba+4; | ||||
| 			ptrbb = ptrbb+2; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 		res0_2 *= alpha; | ||||
| 		res0_3 *= alpha; | ||||
| 
 | ||||
| 		res1_0 *= alpha; | ||||
| 		res1_1 *= alpha; | ||||
| 		res1_2 *= alpha; | ||||
| 		res1_3 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 		C0[2] = res0_2; | ||||
| 		C0[3] = res0_3; | ||||
| 
 | ||||
| 		C1[0] = res1_0; | ||||
| 		C1[1] = res1_1; | ||||
| 		C1[2] = res1_2; | ||||
| 		C1[3] = res1_3; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 4; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 2; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*4; | ||||
| 		ptrbb += temp*2; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 4; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+4; | ||||
| 		C1 = C1+4; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 2 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*2; | ||||
| 		ptrbb = bb + off*2; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 
 | ||||
| 		res1_0 = 0; | ||||
| 		res1_1 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+2;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+2;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 			b1 = ptrbb[1]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 			res1_0 += a0*b1; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 			res1_1 += a1*b1; | ||||
| 
 | ||||
| 			ptrba = ptrba+2; | ||||
| 			ptrbb = ptrbb+2; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 
 | ||||
| 		res1_0 *= alpha; | ||||
| 		res1_1 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 
 | ||||
| 		C1[0] = res1_0; | ||||
| 		C1[1] = res1_1; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 2; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 2; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*2; | ||||
| 		ptrbb += temp*2; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 2; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+2; | ||||
| 		C1 = C1+2; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 1 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*1; | ||||
| 		ptrbb = bb + off*2; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 
 | ||||
| 		res1_0 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+1;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+2;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 			b1 = ptrbb[1]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 			res1_0 += a0*b1; | ||||
| 
 | ||||
| 			ptrba = ptrba+1; | ||||
| 			ptrbb = ptrbb+2; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 
 | ||||
| 		res1_0 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 
 | ||||
| 		C1[0] = res1_0; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 1; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 2; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*1; | ||||
| 		ptrbb += temp*2; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 1; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+1; | ||||
| 		C1 = C1+1; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| 		off += 2; | ||||
| #endif | ||||
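		/* Added note: with a right-side triangular operand (!LEFT) the
		   offset advances by the 2 columns of B consumed per panel; the
		   bn&1 tail below advances it by 1 for the same reason. */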
| 
 | ||||
|         k = (bk<<1); | ||||
|         bb = bb+k; | ||||
|         i = (ldc<<1); | ||||
|         C = C+i; | ||||
|     } | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
|    for (j=0; j<(bn&1); j+=1)  | ||||
|    { | ||||
|         C0 = C; | ||||
| 
 | ||||
| #if defined(TRMMKERNEL) &&  defined(LEFT) | ||||
| 	off = offset; | ||||
| #endif | ||||
| 
 | ||||
|         ptrba = ba; | ||||
| 
 | ||||
|         for (i=0; i<bm/8; i+=1)  | ||||
|         { | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*8; | ||||
| 		ptrbb = bb + off*1; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 		res0_2 = 0; | ||||
| 		res0_3 = 0; | ||||
| 		res0_4 = 0; | ||||
| 		res0_5 = 0; | ||||
| 		res0_6 = 0; | ||||
| 		res0_7 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+8;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+1;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 
 | ||||
| 			a0 = ptrba[2]; | ||||
| 			res0_2 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[3]; | ||||
| 			res0_3 += a1*b0; | ||||
| 
 | ||||
| 			a0 = ptrba[4]; | ||||
| 			res0_4 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[5]; | ||||
| 			res0_5 += a1*b0; | ||||
| 
 | ||||
| 			a0 = ptrba[6]; | ||||
| 			res0_6 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[7]; | ||||
| 			res0_7 += a1*b0; | ||||
| 
 | ||||
| 			ptrba = ptrba+8; | ||||
| 			ptrbb = ptrbb+1; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 		res0_2 *= alpha; | ||||
| 		res0_3 *= alpha; | ||||
| 		res0_4 *= alpha; | ||||
| 		res0_5 *= alpha; | ||||
| 		res0_6 *= alpha; | ||||
| 		res0_7 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 		C0[2] = res0_2; | ||||
| 		C0[3] = res0_3; | ||||
| 		C0[4] = res0_4; | ||||
| 		C0[5] = res0_5; | ||||
| 		C0[6] = res0_6; | ||||
| 		C0[7] = res0_7; | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 8; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 1; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*8; | ||||
| 		ptrbb += temp*1; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 8; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+8; | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 4 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*4; | ||||
| 		ptrbb = bb + off*1; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 		res0_2 = 0; | ||||
| 		res0_3 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+4;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+1;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 
 | ||||
| 			a0 = ptrba[2]; | ||||
| 			res0_2 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[3]; | ||||
| 			res0_3 += a1*b0; | ||||
| 
 | ||||
| 			ptrba = ptrba+4; | ||||
| 			ptrbb = ptrbb+1; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 		res0_2 *= alpha; | ||||
| 		res0_3 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 		C0[2] = res0_2; | ||||
| 		C0[3] = res0_3; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 4; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 1; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*4; | ||||
| 		ptrbb += temp*1; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 4; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+4; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 2 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*2; | ||||
| 		ptrbb = bb + off*1; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 		res0_1 = 0; | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+2;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+1;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 
 | ||||
| 			a1 = ptrba[1]; | ||||
| 			res0_1 += a1*b0; | ||||
| 
 | ||||
| 			ptrba = ptrba+2; | ||||
| 			ptrbb = ptrbb+1; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 		res0_1 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 		C0[1] = res0_1; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 2; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 1; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*2; | ||||
| 		ptrbb += temp*1; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 2; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+2; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 	if ( bm & 1 ) | ||||
| 	{ | ||||
| 
 | ||||
| #if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) | ||||
| 		ptrbb = bb; | ||||
| #else | ||||
| 		ptrba += off*1; | ||||
| 		ptrbb = bb + off*1; | ||||
| #endif | ||||
| 
 | ||||
| 		res0_0 = 0; | ||||
| 
 | ||||
| 
 | ||||
| #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) | ||||
| 		temp = bk-off; | ||||
| #elif defined(LEFT)  | ||||
| 		temp = off+1;	// number of values in A
 | ||||
| #else | ||||
| 		temp = off+1;	// number of values in B
 | ||||
| #endif | ||||
| 
 | ||||
| 		for (k=0; k<temp; k++)  | ||||
|                 { | ||||
| 			b0 = ptrbb[0]; | ||||
| 
 | ||||
| 			a0 = ptrba[0]; | ||||
| 			res0_0 += a0*b0; | ||||
| 
 | ||||
| 			ptrba = ptrba+1; | ||||
| 			ptrbb = ptrbb+1; | ||||
|                 } | ||||
| 
 | ||||
| 		res0_0 *= alpha; | ||||
| 
 | ||||
| 		C0[0] = res0_0; | ||||
| 
 | ||||
| 
 | ||||
| #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))  | ||||
| 		temp = bk - off; | ||||
| #ifdef LEFT | ||||
| 		temp -= 1; // number of values in A
 | ||||
| #else  | ||||
| 		temp -= 1; // number of values in B
 | ||||
| #endif | ||||
| 		ptrba += temp*1; | ||||
| 		ptrbb += temp*1; | ||||
| #endif | ||||
| 
 | ||||
| #ifdef LEFT | ||||
| 		off += 1; // number of values in A
 | ||||
| #endif | ||||
| 
 | ||||
| 		C0 = C0+1; | ||||
| 
 | ||||
| 	} | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #if defined(TRMMKERNEL) && !defined(LEFT) | ||||
| 		off += 1; | ||||
| #endif | ||||
| 
 | ||||
|         k = (bk<<0); | ||||
|         bb = bb+k; | ||||
|         C = C+ldc; | ||||
|    } | ||||
|    return 0; | ||||
| } | ||||
@@ -12,6 +12,7 @@ SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
 SGEMMONCOPY    =  gemm_ncopy_2_bulldozer.S
 SGEMMOTCOPY    =  gemm_tcopy_2_bulldozer.S
+
 SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
 SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -53,8 +54,8 @@ STRSMKERNEL_RN  =  ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT  =  ../generic/trsm_kernel_RT.c

 DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_LT  = dtrsm_kernel_LT_8x2_bulldozer.S
+DTRSMKERNEL_RN  = dtrsm_kernel_RN_8x2_bulldozer.S
 DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c

 CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c

@@ -1255,6 +1255,9 @@

 	
 .L2_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK
+#endif

 	decq	J			// j --
 	jg	.L2_01			// next 2 lines of N
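// Added note: KK plays the same role as "off" in the C kernel above --
// for a right-side TRMM, each finished 2-column panel of B advances the
// diagonal offset by 2 before looping back to .L2_01.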

File diff suppressed because it is too large

@@ -40,7 +40,6 @@
 #include "common.h"
 #include "l2param.h"

-// #undef ALIGNED_ACCESS

 #define A_PRE 256

@@ -111,11 +110,7 @@
 #define Y1	%rbp
 #define X1	%r15

-#ifdef ALIGNED_ACCESS
-#define MM	INCX
-#else
 #define MM	M
-#endif

 #define ALPHA	%xmm15

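A plausible reading of these removals (the commit itself gives no rationale): the rewritten code uses VEX-encoded instructions throughout (vmovups, memory-operand vfmaddpd), which tolerate unaligned addresses in hardware, so the ALIGNED_ACCESS peeling that consumed one leading element to align A no longer pays for its extra branches.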
@@ -216,23 +211,6 @@

 	movq	BUFFER, X1
 	
-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, A
-	je	.L01
-
-	vmovsd	(X), %xmm0
-	addq	INCX, X
-
-	vmovsd	%xmm0, 1 * SIZE(BUFFER)
-	addq	$1 * SIZE, BUFFER
-	addq	$2 * SIZE, X1
-	decq	M
-	jle	.L10
-	ALIGN_4
-
-.L01:
-#endif

 	movq	M,  I
 	sarq	$3, I
 	jle	.L05
@@ -287,10 +265,6 @@
 .L10:
 	movq	Y, Y1

-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, LDA
-	jne	.L50
-#endif

 #if GEMV_UNROLL >= 8
 	cmpq	$8, N
@@ -316,41 +290,6 @@
 	vxorps %xmm7 , 	%xmm7, %xmm7


-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, A
-	je	.L1X
-
-	vmovsd	-16 * SIZE(X1), %xmm12
-
-        vmovsd   -16 * SIZE(A1), %xmm8
-        vmovsd   -16 * SIZE(A1, LDA), %xmm9
-        vmovsd   -16 * SIZE(A1, LDA, 2), %xmm10
-        vmovsd   -16 * SIZE(A1, LDA3), %xmm11
-
-	vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
-	vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
-	vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
-	vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
-
-        vmovsd   -16 * SIZE(A2), %xmm8
-        vmovsd   -16 * SIZE(A2, LDA), %xmm9
-        vmovsd   -16 * SIZE(A2, LDA, 2), %xmm10
-        vmovsd   -16 * SIZE(A2, LDA3), %xmm11
-
-	vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4
-	vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5
-	vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6
-	vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7
-
-
-	addq	 $SIZE, A1
-	addq	 $SIZE, A2
-	addq	 $SIZE, X1
-	ALIGN_3
-
-.L1X:
-#endif

 	movq	M,  I
 	sarq	$3,  I
 	jle	.L15
@@ -671,31 +610,6 @@
 	vxorps %xmm3 , 	%xmm3, %xmm3


-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, A
-	je	.L2X
-
-	vmovsd	-16 * SIZE(X1), %xmm12
-
-	vmovsd   -16 * SIZE(A1), %xmm8
-        vmovsd   -16 * SIZE(A1, LDA), %xmm9
-	vmovsd   -16 * SIZE(A2), %xmm10
-        vmovsd   -16 * SIZE(A2, LDA), %xmm11
-
-        vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
-        vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
-        vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
-        vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
-
-	addq	 $SIZE, A1
-	addq	 $SIZE, A2
-	addq	 $SIZE, X1
-	ALIGN_3
-
-.L2X:
-#endif

 	movq	M,  I
 	sarq	$3,  I
 	jle	.L25
@@ -924,26 +838,6 @@
        vxorps %xmm3 ,  %xmm3, %xmm3


-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, A
-	je	.L3X
-
-	vmovsd	-16 * SIZE(X1), %xmm12
-
-	vmovsd	-16 * SIZE(A1), %xmm8
-	vmovsd	-16 * SIZE(A2), %xmm9
-
-        vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
-        vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
-
-	addq	 $SIZE, A1
-	addq	 $SIZE, A2
-	addq	 $SIZE, X1
-	ALIGN_3
-
-.L3X:
-#endif

 	movq	M,  I
 	sarq	$3,  I
 	jle	.L35
@@ -1100,21 +994,6 @@
        vxorps %xmm3 ,  %xmm3, %xmm3


-#ifdef ALIGNED_ACCESS
-	testq	$SIZE, A
-	je	.L4X
-
-	movsd	-16 * SIZE(X1), %xmm12
-	movsd	-16 * SIZE(A1), %xmm8
-
-	vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
-
-	addq	 $SIZE, A1
-	addq	 $SIZE, X1
-	ALIGN_3
-
-.L4X:
-#endif

 	movq	M,  I
 	sarq	$3,  I
@@ -1223,683 +1102,6 @@
 	vmovlpd	%xmm0, (Y1)
 	addq	INCY, Y1

| #ifdef ALIGNED_ACCESS | ||||
| 	jmp	.L999 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L50: | ||||
| #if GEMV_UNROLL >= 4 | ||||
| 
 | ||||
| 	cmpq	$4, N | ||||
| 	jl	.L60 | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L51: | ||||
| 	subq	$4, N | ||||
| 
 | ||||
| 	leaq	16 * SIZE(BUFFER), X1 | ||||
| 
 | ||||
| 	movq	A, A1 | ||||
| 	leaq	(A1, LDA, 2), A2 | ||||
| 	leaq	(A1, LDA, 4), A | ||||
| 
 | ||||
|         vxorps %xmm0 ,  %xmm0, %xmm0 | ||||
|         vxorps %xmm1 ,  %xmm1, %xmm1 | ||||
|         vxorps %xmm2 ,  %xmm2, %xmm2 | ||||
|         vxorps %xmm3 ,  %xmm3, %xmm3 | ||||
| 
 | ||||
| 
 | ||||
| 
 | ||||
| #ifdef ALIGNED_ACCESS | ||||
| 	testq	$SIZE, A | ||||
| 	je	.L5X | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(A1), %xmm4 | ||||
| 	vmovsd	-16 * SIZE(A1, LDA), %xmm5 | ||||
| 	vmovsd	-16 * SIZE(A2), %xmm6 | ||||
| 	vmovsd	-16 * SIZE(A2, LDA), %xmm7 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0, %xmm4 , %xmm12, %xmm0 | ||||
|         vfmaddpd %xmm1, %xmm5 , %xmm12, %xmm1 | ||||
|         vfmaddpd %xmm2, %xmm6 , %xmm12, %xmm2 | ||||
|         vfmaddpd %xmm3, %xmm7 , %xmm12, %xmm3 | ||||
| 
 | ||||
| 
 | ||||
| 	addq	 $SIZE, A1 | ||||
| 	addq	 $SIZE, A2 | ||||
| 	addq	 $SIZE, X1 | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L5X: | ||||
| #endif | ||||
| 
 | ||||
|         vxorps %xmm8 ,  %xmm8, %xmm8 | ||||
|         vxorps %xmm9 ,  %xmm9, %xmm9 | ||||
| 	vmovhpd	-16 * SIZE(A1, LDA), %xmm8 , %xmm8 | ||||
| 	vmovhpd	-16 * SIZE(A2, LDA), %xmm9 , %xmm9 | ||||
| 
 | ||||
| 	movq	M,  I | ||||
| 	sarq	$3,  I | ||||
| 	jle	.L55 | ||||
| 
 | ||||
| 	VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 	VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jle	.L53 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L52: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -16 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	VMOVUPS_A1(-12 * SIZE, A1, %xmm4) | ||||
| 	vshufpd	$1, %xmm8, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A2)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -12 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 	VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -10 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -10 * SIZE(A2)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 | ||||
| 	VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, A2 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jg	.L52 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L53: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -16 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A2)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -12 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 	VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -10 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -10 * SIZE(A2)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 | ||||
| 	VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, A2 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L55: | ||||
| 	testq	$4, M | ||||
| 	jle	.L56 | ||||
| 
 | ||||
| 	VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 	VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -16 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 	VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A2)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3 | ||||
| 
 | ||||
| 
 | ||||
| 	addq	$4 * SIZE, A1 | ||||
| 	addq	$4 * SIZE, A2 | ||||
| 	addq	$4 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L56: | ||||
| 	testq	$2, M | ||||
| 	jle	.L57 | ||||
| 
 | ||||
| 	VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5) | ||||
| 	VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 
 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , -16 * SIZE(A2)           , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 
 | ||||
| 	addq	$2 * SIZE, A1 | ||||
| 	addq	$2 * SIZE, A2 | ||||
| 	addq	$2 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L57: | ||||
| 	testq	$1, M | ||||
| 	je	.L58 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(A1), %xmm4 | ||||
| 	vmovsd	-16 * SIZE(A2), %xmm6 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 
 | ||||
| 	vfmaddpd %xmm2 , %xmm6 , %xmm12 , %xmm2 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3 | ||||
| 
 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L58: | ||||
| 	vhaddpd	%xmm1, %xmm0 , %xmm0 | ||||
| 	vhaddpd	%xmm3, %xmm2 , %xmm2 | ||||
| 
 | ||||
| 	vmulpd	ALPHA, %xmm0 , %xmm0 | ||||
| 	vmulpd	ALPHA, %xmm2 , %xmm2 | ||||
| 
 | ||||
| 	cmpq	$SIZE, INCY | ||||
| 	jne	.L59 | ||||
| 
 | ||||
| 	vmovups  0 * SIZE(Y), %xmm4 | ||||
| 	vmovups  2 * SIZE(Y), %xmm5 | ||||
| 	addq	$4 * SIZE, Y | ||||
| 
 | ||||
| 	vaddpd	%xmm4, %xmm0 , %xmm0 | ||||
| 	vaddpd	%xmm5, %xmm2 , %xmm2 | ||||
| 
 | ||||
| 	vmovups	%xmm0,  0 * SIZE(Y1) | ||||
| 	vmovups	%xmm2,  2 * SIZE(Y1) | ||||
| 	addq	$4 * SIZE, Y1 | ||||
| 
 | ||||
| 	cmpq	$4, N | ||||
| 	jge	.L51 | ||||
| 	jmp	.L60 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L59: | ||||
| 	vmovsd	(Y), %xmm4 | ||||
| 	addq	INCY, Y | ||||
| 	vmovhpd	(Y), %xmm4 , %xmm4 | ||||
| 	addq	INCY, Y | ||||
| 	vmovsd	(Y), %xmm5 | ||||
| 	addq	INCY, Y | ||||
| 	vmovhpd	(Y), %xmm5 , %xmm5 | ||||
| 	addq	INCY, Y | ||||
| 
 | ||||
| 	vaddpd	%xmm4, %xmm0 , %xmm0 | ||||
| 	vaddpd	%xmm5, %xmm2 , %xmm2 | ||||
| 
 | ||||
| 	vmovlpd	%xmm0, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 	vmovhpd	%xmm0, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 	vmovlpd	%xmm2, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 	vmovhpd	%xmm2, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 	cmpq	$4, N | ||||
| 	jge	.L51 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L60: | ||||
| #endif | ||||
| 
 | ||||
| #if GEMV_UNROLL >= 2 | ||||
| 
 | ||||
| 	cmpq	$2, N | ||||
| 	jl	.L70 | ||||
| 
 | ||||
| #if GEMV_UNROLL == 2 | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L61: | ||||
| #endif | ||||
| 
 | ||||
| 	subq	$2, N | ||||
| 
 | ||||
| 	leaq	16 * SIZE(BUFFER), X1 | ||||
| 
 | ||||
| 	movq	A, A1 | ||||
| 	leaq	(A1, LDA), A2 | ||||
| 	leaq	(A1, LDA, 2), A | ||||
| 
 | ||||
|         vxorps %xmm0 ,  %xmm0, %xmm0 | ||||
|         vxorps %xmm1 ,  %xmm1, %xmm1 | ||||
|         vxorps %xmm2 ,  %xmm2, %xmm2 | ||||
|         vxorps %xmm3 ,  %xmm3, %xmm3 | ||||
| 
 | ||||
| #ifdef ALIGNED_ACCESS | ||||
| 	testq	$SIZE, A | ||||
| 	je	.L6X | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(A1), %xmm4 | ||||
| 	vmovsd	-16 * SIZE(A2), %xmm5 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm12 , %xmm1 | ||||
| 
 | ||||
| 	addq	 $SIZE, A1 | ||||
| 	addq	 $SIZE, A2 | ||||
| 	addq	 $SIZE, X1 | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L6X: | ||||
| #endif | ||||
| 
 | ||||
|         vxorps %xmm8 ,  %xmm8, %xmm8 | ||||
| 	vmovhpd	-16 * SIZE(A2), %xmm8 , %xmm8 | ||||
| 
 | ||||
| 	movq	M,  I | ||||
| 	sarq	$3,  I | ||||
| 	jle	.L65 | ||||
| 
 | ||||
| 	VMOVUPS_A1(-15 * SIZE, A2, %xmm5) | ||||
| 	VMOVUPS_A1(-13 * SIZE, A2, %xmm7) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jle	.L63 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L62: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A1(-11 * SIZE, A2, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A1( -9 * SIZE, A2, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_XL1(-8 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A1(-7 * SIZE, A2, %xmm5) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -10 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_XL1(-6 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A1(-5 * SIZE, A2, %xmm7) | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, A2 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jg	.L62 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L63: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_A1(-11 * SIZE, A2, %xmm9) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm5 , %xmm5 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 	VMOVUPS_A1( -9 * SIZE, A2, %xmm8) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm9, %xmm7 , %xmm7 | ||||
| 	vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -10 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm8, %xmm9 , %xmm9 | ||||
| 	vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1 | ||||
| 
 | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, A2 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L65: | ||||
| 	testq	$4, M | ||||
| 	jle	.L66 | ||||
| 
 | ||||
| 	VMOVUPS_A1(-15 * SIZE, A2, %xmm5) | ||||
| 	VMOVUPS_A1(-13 * SIZE, A2, %xmm7) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -14 * SIZE(A1)           , %xmm13 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm7, %xmm5 , %xmm5 | ||||
| 	vmovups	%xmm7, %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1 | ||||
| 
 | ||||
| 	addq	$4 * SIZE, A1 | ||||
| 	addq	$4 * SIZE, A2 | ||||
| 	addq	$4 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L66: | ||||
| 	testq	$2, M | ||||
| 	jle	.L67 | ||||
| 
 | ||||
| 	VMOVUPS_A1(-15 * SIZE, A2, %xmm5) | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	vshufpd	$1, %xmm5, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	movaps	%xmm5, %xmm8 | ||||
| 
 | ||||
| 	addq	$2 * SIZE, A1 | ||||
| 	addq	$2 * SIZE, A2 | ||||
| 	addq	$2 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L67: | ||||
| 	testq	$1, M | ||||
| 	je	.L68 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vshufpd	$1, %xmm8, %xmm8 , %xmm8 | ||||
| 	vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L68: | ||||
| 	vaddpd	%xmm2, %xmm0 , %xmm0 | ||||
| 	vaddpd	%xmm3, %xmm1 , %xmm1 | ||||
| 
 | ||||
| 	vhaddpd	%xmm1, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmulpd	ALPHA, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmovsd	(Y), %xmm4 | ||||
| 	addq	INCY, Y | ||||
| 	vmovhpd	(Y), %xmm4 , %xmm4 | ||||
| 	addq	INCY, Y | ||||
| 
 | ||||
| 	vaddpd	%xmm4, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmovlpd	%xmm0, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 	vmovhpd	%xmm0, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| 
 | ||||
| #if GEMV_UNROLL == 2 | ||||
| 	cmpq	$2, N | ||||
| 	jge	.L61 | ||||
| #endif | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L70: | ||||
| 	cmpq	$1, N | ||||
| 	jl	.L999 | ||||
| 
 | ||||
| #endif | ||||
| 
 | ||||
| 	leaq	16 * SIZE(BUFFER), X1 | ||||
| 
 | ||||
| 	movq	A, A1 | ||||
| 
 | ||||
|         vxorps %xmm0 ,  %xmm0, %xmm0 | ||||
|         vxorps %xmm1 ,  %xmm1, %xmm1 | ||||
|         vxorps %xmm2 ,  %xmm2, %xmm2 | ||||
|         vxorps %xmm3 ,  %xmm3, %xmm3 | ||||
| 
 | ||||
| #ifdef ALIGNED_ACCESS | ||||
| 	testq	$SIZE, A | ||||
| 	je	.L7X | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 	vmovsd	-16 * SIZE(A1), %xmm4 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	addq	 $SIZE, A1 | ||||
| 	addq	 $SIZE, X1 | ||||
| 	ALIGN_3 | ||||
| 
 | ||||
| .L7X: | ||||
| #endif | ||||
| 	movq	M,  I | ||||
| 	sarq	$3,  I | ||||
| 	jle	.L75 | ||||
| 
 | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jle	.L73 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L72: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A1)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm2 , -10 * SIZE(A1)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	VMOVUPS_XL1( -8 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1( -6 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 
 | ||||
| 	decq	I | ||||
| 	jg	.L72 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L73: | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A1)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-12 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-10 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -12 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm2 , -10 * SIZE(A1)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	addq	$8 * SIZE, A1 | ||||
| 	addq	$8 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L75: | ||||
| 	testq	$4, M | ||||
| 	jle	.L76 | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 	VMOVUPS_XL1(-14 * SIZE, X1, %xmm13) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 	vfmaddpd %xmm2 , -14 * SIZE(A1)           , %xmm13 , %xmm2 | ||||
| 
 | ||||
| 	addq	$4 * SIZE, A1 | ||||
| 	addq	$4 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L76: | ||||
| 	testq	$2, M | ||||
| 	jle	.L77 | ||||
| 
 | ||||
| 	VMOVUPS_XL1(-16 * SIZE, X1, %xmm12) | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , -16 * SIZE(A1)           , %xmm12 , %xmm0 | ||||
| 
 | ||||
| 	addq	$2 * SIZE, A1 | ||||
| 	addq	$2 * SIZE, X1 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L77: | ||||
| 	testq	$1, M | ||||
| 	je	.L78 | ||||
| 
 | ||||
| 	vmovsd	-16 * SIZE(X1), %xmm12 | ||||
| 	vmovsd	-16 * SIZE(A1), %xmm4 | ||||
| 
 | ||||
| 	vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0 | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L78: | ||||
| 	vaddpd	%xmm2, %xmm0 , %xmm0 | ||||
| 	vaddpd	%xmm3, %xmm1 , %xmm1 | ||||
| 
 | ||||
| 	vaddpd	%xmm1, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vhaddpd	%xmm1, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmulsd	ALPHA, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmovsd	(Y), %xmm4 | ||||
| 	addq	INCY, Y | ||||
| 
 | ||||
| 	vaddsd	%xmm4, %xmm0 , %xmm0 | ||||
| 
 | ||||
| 	vmovlpd	%xmm0, (Y1) | ||||
| 	addq	INCY, Y1 | ||||
| #endif | ||||
| 	ALIGN_4 | ||||
| 
 | ||||
| .L999: | ||||

File diffs suppressed because they are too large

@@ -949,6 +949,9 @@

 	
 .L2_60:
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK
+#endif

 	decq	J			// j --
 	jg	.L2_01			// next 2 lines of N