Merge branch 'bulldozer' into develop

Zhang Xianyi 2013-08-12 23:22:10 +08:00
commit c0b1e41bec
17 changed files with 9774 additions and 1930 deletions

View File

@@ -324,16 +324,14 @@ ifeq ($(ARCH), x86)
DYNAMIC_CORE = KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS \
CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE
#BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif
ifeq ($(ARCH), x86_64)
DYNAMIC_CORE = PRESCOTT CORE2 PENRYN DUNNINGTON NEHALEM OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO
ifneq ($(NO_AVX), 1)
DYNAMIC_CORE += SANDYBRIDGE
#BULLDOZER PILEDRIVER
DYNAMIC_CORE += SANDYBRIDGE BULLDOZER PILEDRIVER
endif
endif

View File

@@ -105,8 +105,8 @@
#define CORE_NANO 19
#define CORE_SANDYBRIDGE 20
#define CORE_BOBCAT 21
#define CORE_BULLDOZER CORE_BARCELONA
#define CORE_PILEDRIVER CORE_BARCELONA
#define CORE_BULLDOZER 22
#define CORE_PILEDRIVER 23
#define CORE_HASWELL CORE_SANDYBRIDGE
#define HAVE_SSE (1 << 0)
@@ -198,8 +198,8 @@ typedef struct {
#define CPUTYPE_NANO 43
#define CPUTYPE_SANDYBRIDGE 44
#define CPUTYPE_BOBCAT 45
#define CPUTYPE_BULLDOZER CPUTYPE_BARCELONA
#define CPUTYPE_PILEDRIVER CPUTYPE_BARCELONA
#define CPUTYPE_BULLDOZER 46
#define CPUTYPE_PILEDRIVER 47
// this define is because BLAS doesn't have haswell specific optimizations yet
#define CPUTYPE_HASWELL CPUTYPE_SANDYBRIDGE

View File

@@ -63,16 +63,14 @@ extern gotoblas_t gotoblas_BARCELONA;
extern gotoblas_t gotoblas_BOBCAT;
#ifndef NO_AVX
extern gotoblas_t gotoblas_SANDYBRIDGE;
//extern gotoblas_t gotoblas_BULLDOZER;
//extern gotoblas_t gotoblas_PILEDRIVER;
extern gotoblas_t gotoblas_BULLDOZER;
extern gotoblas_t gotoblas_PILEDRIVER;
#else
//Use NEHALEM kernels for sandy bridge
#define gotoblas_SANDYBRIDGE gotoblas_NEHALEM
#endif
#define gotoblas_BULLDOZER gotoblas_BARCELONA
#define gotoblas_PILEDRIVER gotoblas_BARCELONA
#endif
//Use sandy bridge kernels for haswell.
#define gotoblas_HASWELL gotoblas_SANDYBRIDGE
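With gotoblas_BULLDOZER and gotoblas_PILEDRIVER now declared as real kernel tables instead of being #defined to gotoblas_BARCELONA, the runtime dispatcher can return dedicated tables for these CPUs. A minimal sketch of that idea, reusing the CORE_* ids from the cpuid header above; the helper name and switch-based lookup are illustrative only, not the library's actual selection code:

static gotoblas_t *pick_table(int core) {
  switch (core) {
  case CORE_BULLDOZER:  return &gotoblas_BULLDOZER;   /* new dedicated table */
  case CORE_PILEDRIVER: return &gotoblas_PILEDRIVER;  /* new dedicated table */
  default:              return NULL;                  /* other core ids handled elsewhere */
  }
}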

View File

@@ -494,7 +494,7 @@ static void disable_affinity(void) {
#ifndef USE_OPENMP
for(i=0; i< count; i++){
lprocmask[i] &= ((unsigned long *)&cpu_orig_mask[0])[i];
lprocmask[i] &= common->avail[i];
}
#endif
@@ -754,7 +754,7 @@ void gotoblas_affinity_init(void) {
if (common -> num_nodes > 1) numa_mapping();
common -> final_num_procs = 0;
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += popcount(common -> avail[i]);
for(i = 0; i < common -> avail_count; i++) common -> final_num_procs += rcount(common -> avail[i]) + 1; //Make the max cpu number.
for (cpu = 0; cpu < common -> final_num_procs; cpu ++) common -> cpu_use[cpu] = 0;
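For context on the replaced line above: final_num_procs is no longer the number of set bits in the availability mask but the highest CPU number the mask covers, as the inline comment notes. A minimal sketch of the two counting helpers, assuming popcount counts set bits and rcount returns the index of the highest set bit (so rcount(mask) + 1 is the highest covered CPU index plus one); the bodies below are illustrative, not the library's implementation:

static int popcount(unsigned long mask) {   /* number of set bits */
  int n = 0;
  for (; mask; mask >>= 1) n += (int)(mask & 1);
  return n;
}

static int rcount(unsigned long mask) {     /* index of the highest set bit (assumed semantics) */
  int n = -1;
  for (; mask; mask >>= 1) n++;
  return n;
}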

View File

@@ -354,7 +354,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "OPTERON"
#endif
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL) || defined (FORCE_PILEDRIVER) || defined (FORCE_BULLDOZER)
#if defined(FORCE_BARCELONA) || defined(FORCE_SHANGHAI) || defined(FORCE_ISTANBUL)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -384,7 +384,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BOBCAT"
#endif
#if 0
#if defined (FORCE_BULLDOZER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"
@@ -400,7 +400,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CORENAME "BULLDOZER"
#endif
#if 0
#if defined (FORCE_PILEDRIVER)
#define FORCE
#define FORCE_INTEL
#define ARCHITECTURE "X86"

View File

@@ -582,6 +582,24 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
ifdef STRMMKERNEL
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -595,17 +613,79 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
else
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
endif
ifdef QTRMMKERNEL
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -619,6 +699,50 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef CTRMMKERNEL
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -643,6 +767,37 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
ifdef ZTRMMKERNEL
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -666,7 +821,37 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
endif
ifdef XTRMMKERNEL
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -692,6 +877,9 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@

File diff suppressed because it is too large

View File

@@ -0,0 +1,750 @@
#include "common.h"
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res0_4;
FLOAT res0_5;
FLOAT res0_6;
FLOAT res0_7;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT res1_4;
FLOAT res1_5;
FLOAT res1_6;
FLOAT res1_7;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
BLASLONG off, temp;
#if !defined(LEFT)
off = -offset;
#endif
for (j=0; j<bn/2; j+=1)
{
C0 = C;
C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/8; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*8;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res0_4 = 0;
res0_5 = 0;
res0_6 = 0;
res0_7 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
res1_4 = 0;
res1_5 = 0;
res1_6 = 0;
res1_7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
a0 = ptrba[4];
res0_4 += a0*b0;
res1_4 += a0*b1;
a1 = ptrba[5];
res0_5 += a1*b0;
res1_5 += a1*b1;
a0 = ptrba[6];
res0_6 += a0*b0;
res1_6 += a0*b1;
a1 = ptrba[7];
res0_7 += a1*b0;
res1_7 += a1*b1;
ptrba = ptrba+8;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res0_4 *= alpha;
res0_5 *= alpha;
res0_6 *= alpha;
res0_7 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
res1_4 *= alpha;
res1_5 *= alpha;
res1_6 *= alpha;
res1_7 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C0[4] = res0_4;
C0[5] = res0_5;
C0[6] = res0_6;
C0[7] = res0_7;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
C1[4] = res1_4;
C1[5] = res1_5;
C1[6] = res1_6;
C1[7] = res1_7;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*8;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 8; // number of values in A
#endif
C0 = C0+8;
C1 = C1+8;
}
if ( bm & 4 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C1[0] = res1_0;
C1[1] = res1_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res1_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] = res0_0;
C1[0] = res1_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
C1 = C1+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;
#endif
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
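/* Tail case: when bn is odd, the loop below processes the one remaining
   column of B with the same 8/4/2/1-row blocking of A. */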
for (j=0; j<(bn&1); j+=1)
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/8; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*8;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res0_4 = 0;
res0_5 = 0;
res0_6 = 0;
res0_7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
a0 = ptrba[4];
res0_4 += a0*b0;
a1 = ptrba[5];
res0_5 += a1*b0;
a0 = ptrba[6];
res0_6 += a0*b0;
a1 = ptrba[7];
res0_7 += a1*b0;
ptrba = ptrba+8;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res0_4 *= alpha;
res0_5 *= alpha;
res0_6 *= alpha;
res0_7 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C0[4] = res0_4;
C0[5] = res0_5;
C0[6] = res0_6;
C0[7] = res0_7;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*8;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 8; // number of values in A
#endif
C0 = C0+8;
}
if ( bm & 4 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
}
if ( bm & 2 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
}
if ( bm & 1 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] = res0_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}

View File

@@ -12,6 +12,7 @@ SGEMMINCOPY = ../generic/gemm_ncopy_16.c
SGEMMITCOPY = ../generic/gemm_tcopy_16.c
SGEMMONCOPY = gemm_ncopy_2_bulldozer.S
SGEMMOTCOPY = gemm_tcopy_2_bulldozer.S
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
@@ -53,8 +54,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c

View File

@@ -1255,6 +1255,9 @@
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N

File diff suppressed because it is too large

View File

@@ -40,7 +40,6 @@
#include "common.h"
#include "l2param.h"
// #undef ALIGNED_ACCESS
#define A_PRE 256
@@ -111,11 +110,7 @@
#define Y1 %rbp
#define X1 %r15
#ifdef ALIGNED_ACCESS
#define MM INCX
#else
#define MM M
#endif
#define ALPHA %xmm15
@@ -216,23 +211,6 @@
movq BUFFER, X1
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L01
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, 1 * SIZE(BUFFER)
addq $1 * SIZE, BUFFER
addq $2 * SIZE, X1
decq M
jle .L10
ALIGN_4
.L01:
#endif
movq M, I
sarq $3, I
jle .L05
@@ -287,10 +265,6 @@
.L10:
movq Y, Y1
#ifdef ALIGNED_ACCESS
testq $SIZE, LDA
jne .L50
#endif
#if GEMV_UNROLL >= 8
cmpq $8, N
@@ -316,41 +290,6 @@
vxorps %xmm7 , %xmm7, %xmm7
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L1X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A1, LDA), %xmm9
vmovsd -16 * SIZE(A1, LDA, 2), %xmm10
vmovsd -16 * SIZE(A1, LDA3), %xmm11
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
vmovsd -16 * SIZE(A2), %xmm8
vmovsd -16 * SIZE(A2, LDA), %xmm9
vmovsd -16 * SIZE(A2, LDA, 2), %xmm10
vmovsd -16 * SIZE(A2, LDA3), %xmm11
vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4
vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5
vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6
vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L1X:
#endif
movq M, I
sarq $3, I
jle .L15
@@ -671,31 +610,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L2X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A1, LDA), %xmm9
vmovsd -16 * SIZE(A2), %xmm10
vmovsd -16 * SIZE(A2, LDA), %xmm11
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L2X:
#endif
movq M, I
sarq $3, I
jle .L25
@@ -924,26 +838,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L3X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A2), %xmm9
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L3X:
#endif
movq M, I
sarq $3, I
jle .L35
@@ -1100,21 +994,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L4X
movsd -16 * SIZE(X1), %xmm12
movsd -16 * SIZE(A1), %xmm8
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
addq $SIZE, A1
addq $SIZE, X1
ALIGN_3
.L4X:
#endif
movq M, I
sarq $3, I
@@ -1223,683 +1102,6 @@
vmovlpd %xmm0, (Y1)
addq INCY, Y1
#ifdef ALIGNED_ACCESS
jmp .L999
ALIGN_4
.L50:
#if GEMV_UNROLL >= 4
cmpq $4, N
jl .L60
ALIGN_3
.L51:
subq $4, N
leaq 16 * SIZE(BUFFER), X1
movq A, A1
leaq (A1, LDA, 2), A2
leaq (A1, LDA, 4), A
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L5X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A1, LDA), %xmm5
vmovsd -16 * SIZE(A2), %xmm6
vmovsd -16 * SIZE(A2, LDA), %xmm7
vfmaddpd %xmm0, %xmm4 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm5 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm6 , %xmm12, %xmm2
vfmaddpd %xmm3, %xmm7 , %xmm12, %xmm3
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L5X:
#endif
vxorps %xmm8 , %xmm8, %xmm8
vxorps %xmm9 , %xmm9, %xmm9
vmovhpd -16 * SIZE(A1, LDA), %xmm8 , %xmm8
vmovhpd -16 * SIZE(A2, LDA), %xmm9 , %xmm9
movq M, I
sarq $3, I
jle .L55
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L53
ALIGN_4
.L52:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
VMOVUPS_A1(-12 * SIZE, A1, %xmm4)
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
decq I
jg .L52
ALIGN_4
.L53:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
ALIGN_4
.L55:
testq $4, M
jle .L56
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
addq $4 * SIZE, A1
addq $4 * SIZE, A2
addq $4 * SIZE, X1
ALIGN_4
.L56:
testq $2, M
jle .L57
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
addq $2 * SIZE, A1
addq $2 * SIZE, A2
addq $2 * SIZE, X1
ALIGN_4
.L57:
testq $1, M
je .L58
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A2), %xmm6
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
vshufpd $1, %xmm8, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm12 , %xmm2
vshufpd $1, %xmm9, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
ALIGN_4
.L58:
vhaddpd %xmm1, %xmm0 , %xmm0
vhaddpd %xmm3, %xmm2 , %xmm2
vmulpd ALPHA, %xmm0 , %xmm0
vmulpd ALPHA, %xmm2 , %xmm2
cmpq $SIZE, INCY
jne .L59
vmovups 0 * SIZE(Y), %xmm4
vmovups 2 * SIZE(Y), %xmm5
addq $4 * SIZE, Y
vaddpd %xmm4, %xmm0 , %xmm0
vaddpd %xmm5, %xmm2 , %xmm2
vmovups %xmm0, 0 * SIZE(Y1)
vmovups %xmm2, 2 * SIZE(Y1)
addq $4 * SIZE, Y1
cmpq $4, N
jge .L51
jmp .L60
ALIGN_4
.L59:
vmovsd (Y), %xmm4
addq INCY, Y
vmovhpd (Y), %xmm4 , %xmm4
addq INCY, Y
vmovsd (Y), %xmm5
addq INCY, Y
vmovhpd (Y), %xmm5 , %xmm5
addq INCY, Y
vaddpd %xmm4, %xmm0 , %xmm0
vaddpd %xmm5, %xmm2 , %xmm2
vmovlpd %xmm0, (Y1)
addq INCY, Y1
vmovhpd %xmm0, (Y1)
addq INCY, Y1
vmovlpd %xmm2, (Y1)
addq INCY, Y1
vmovhpd %xmm2, (Y1)
addq INCY, Y1
cmpq $4, N
jge .L51
ALIGN_4
.L60:
#endif
#if GEMV_UNROLL >= 2
cmpq $2, N
jl .L70
#if GEMV_UNROLL == 2
ALIGN_3
.L61:
#endif
subq $2, N
leaq 16 * SIZE(BUFFER), X1
movq A, A1
leaq (A1, LDA), A2
leaq (A1, LDA, 2), A
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L6X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A2), %xmm5
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm12 , %xmm1
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L6X:
#endif
vxorps %xmm8 , %xmm8, %xmm8
vmovhpd -16 * SIZE(A2), %xmm8 , %xmm8
movq M, I
sarq $3, I
jle .L65
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_A1(-13 * SIZE, A2, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L63
ALIGN_4
.L62:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A1(-11 * SIZE, A2, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A1(-7 * SIZE, A2, %xmm5)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm9 , %xmm9
vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A1(-5 * SIZE, A2, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
decq I
jg .L62
ALIGN_4
.L63:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A1(-11 * SIZE, A2, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm9 , %xmm9
vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
ALIGN_4
.L65:
testq $4, M
jle .L66
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_A1(-13 * SIZE, A2, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vmovups %xmm7, %xmm8
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
addq $4 * SIZE, A1
addq $4 * SIZE, A2
addq $4 * SIZE, X1
ALIGN_4
.L66:
testq $2, M
jle .L67
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
movaps %xmm5, %xmm8
addq $2 * SIZE, A1
addq $2 * SIZE, A2
addq $2 * SIZE, X1
ALIGN_4
.L67:
testq $1, M
je .L68
vmovsd -16 * SIZE(X1), %xmm12
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm8, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
ALIGN_4
.L68:
vaddpd %xmm2, %xmm0 , %xmm0
vaddpd %xmm3, %xmm1 , %xmm1
vhaddpd %xmm1, %xmm0 , %xmm0
vmulpd ALPHA, %xmm0 , %xmm0
vmovsd (Y), %xmm4
addq INCY, Y
vmovhpd (Y), %xmm4 , %xmm4
addq INCY, Y
vaddpd %xmm4, %xmm0 , %xmm0
vmovlpd %xmm0, (Y1)
addq INCY, Y1
vmovhpd %xmm0, (Y1)
addq INCY, Y1
#if GEMV_UNROLL == 2
cmpq $2, N
jge .L61
#endif
ALIGN_4
.L70:
cmpq $1, N
jl .L999
#endif
leaq 16 * SIZE(BUFFER), X1
movq A, A1
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L7X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
addq $SIZE, A1
addq $SIZE, X1
ALIGN_3
.L7X:
#endif
movq M, I
sarq $3, I
jle .L75
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L73
ALIGN_4
.L72:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1( -8 * SIZE, X1, %xmm12)
VMOVUPS_XL1( -6 * SIZE, X1, %xmm13)
addq $8 * SIZE, A1
addq $8 * SIZE, X1
decq I
jg .L72
ALIGN_4
.L73:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2
addq $8 * SIZE, A1
addq $8 * SIZE, X1
ALIGN_4
.L75:
testq $4, M
jle .L76
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
addq $4 * SIZE, A1
addq $4 * SIZE, X1
ALIGN_4
.L76:
testq $2, M
jle .L77
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
addq $2 * SIZE, A1
addq $2 * SIZE, X1
ALIGN_4
.L77:
testq $1, M
je .L78
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
ALIGN_4
.L78:
vaddpd %xmm2, %xmm0 , %xmm0
vaddpd %xmm3, %xmm1 , %xmm1
vaddpd %xmm1, %xmm0 , %xmm0
vhaddpd %xmm1, %xmm0 , %xmm0
vmulsd ALPHA, %xmm0 , %xmm0
vmovsd (Y), %xmm4
addq INCY, Y
vaddsd %xmm4, %xmm0 , %xmm0
vmovlpd %xmm0, (Y1)
addq INCY, Y1
#endif
ALIGN_4
.L999:

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

View File

@@ -949,6 +949,9 @@
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N