wernsaar 2013-08-08 09:03:35 -07:00
commit b3220e63e2
11 changed files with 6672 additions and 1912 deletions


@@ -582,6 +582,24 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
ifdef STRMMKERNEL
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(STRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)strmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -595,17 +613,79 @@ $(KDIR)strmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(KDIR)strmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(SGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
else
ifdef DTRMMKERNEL_LN
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_LT
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_LT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RN
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RN)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
endif
ifdef DTRMMKERNEL_RT
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL_RT)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)dtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(DGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
endif
ifdef QTRMMKERNEL
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
@@ -619,6 +699,50 @@ $(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
else
$(KDIR)qtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -DLEFT -DTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -UTRANSA $< -o $@
$(KDIR)qtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(QGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -UCOMPLEX -ULEFT -DTRANSA $< -o $@
endif
ifdef CTRMMKERNEL
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ctrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ctrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -643,6 +767,37 @@ $(KDIR)ctrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(KDIR)ctrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -UDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
ifdef ZTRMMKERNEL
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)ztrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)ztrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -666,7 +821,37 @@ $(KDIR)ztrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(KDIR)ztrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(ZGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
endif
ifdef XTRMMKERNEL
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_LR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_LC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -DTRANSA -DCONJ -DCN $< -o $@
$(KDIR)xtrmm_kernel_RN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RT$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -UCONJ -DNN $< -o $@
$(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -UTRANSA -DCONJ -DNC $< -o $@
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XTRMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
else
$(KDIR)xtrmm_kernel_LN$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -DLEFT -UTRANSA -UCONJ -DNN $< -o $@
@@ -692,6 +877,9 @@ $(KDIR)xtrmm_kernel_RR$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(KDIR)xtrmm_kernel_RC$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(XGEMMKERNEL)
$(CC) $(CFLAGS) -c -DTRMMKERNEL -DXDOUBLE -DCOMPLEX -ULEFT -DTRANSA -DCONJ -DNC $< -o $@
endif
$(KDIR)cgemm3m_kernel$(TSUFFIX).$(SUFFIX) : $(KERNELDIR)/$(CGEMM3MKERNEL)
$(CC) $(CFLAGS) -c -UDOUBLE -DCOMPLEX -DNN $< -o $@
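Note on the rules above: each variant is built from the same kernel source, compiled repeatedly with different -D/-U switches (TRMMKERNEL, DOUBLE/XDOUBLE, COMPLEX, LEFT, TRANSA, CONJ). The new STRMMKERNEL, DTRMMKERNEL_LN/_LT/_RN/_RT, CTRMMKERNEL, ZTRMMKERNEL and XTRMMKERNEL variables let a target supply a dedicated TRMM kernel for a given variant; when they are unset, the rules fall back to the shared GEMM kernel source as before. As a rough sketch (mirroring the #if blocks of the C kernel added later in this commit, not a new file of its own), the switches are consumed inside a kernel source like this:

/* sketch only: how the -D/-U flags select one TRMM variant in a kernel source */
#if defined(TRMMKERNEL) && defined(LEFT) && !defined(TRANSA)
/* built as ?trmm_kernel_LN: triangular operand on the left, not transposed */
#elif defined(TRMMKERNEL) && defined(LEFT) && defined(TRANSA)
/* built as ?trmm_kernel_LT: triangular operand on the left, transposed */
#elif defined(TRMMKERNEL) && !defined(TRANSA)
/* built as ?trmm_kernel_RN: triangular operand on the right, not transposed */
#elif defined(TRMMKERNEL)
/* built as ?trmm_kernel_RT: triangular operand on the right, transposed */
#endif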

File diff suppressed because it is too large.


@@ -0,0 +1,750 @@
#include "common.h"
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
{
BLASLONG i,j,k;
FLOAT *C0,*C1,*ptrba,*ptrbb;
FLOAT res0_0;
FLOAT res0_1;
FLOAT res0_2;
FLOAT res0_3;
FLOAT res0_4;
FLOAT res0_5;
FLOAT res0_6;
FLOAT res0_7;
FLOAT res1_0;
FLOAT res1_1;
FLOAT res1_2;
FLOAT res1_3;
FLOAT res1_4;
FLOAT res1_5;
FLOAT res1_6;
FLOAT res1_7;
FLOAT a0;
FLOAT a1;
FLOAT b0;
FLOAT b1;
BLASLONG off, temp;
#if !defined(LEFT)
off = -offset;
#endif
for (j=0; j<bn/2; j+=1) // process the columns of B (and C) two at a time
{
C0 = C;
C1 = C0+ldc;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/8; i+=1) // 8x2 register tile: 8 rows of A against the current 2 columns of B
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*8;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res0_4 = 0;
res0_5 = 0;
res0_6 = 0;
res0_7 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
res1_4 = 0;
res1_5 = 0;
res1_6 = 0;
res1_7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
a0 = ptrba[4];
res0_4 += a0*b0;
res1_4 += a0*b1;
a1 = ptrba[5];
res0_5 += a1*b0;
res1_5 += a1*b1;
a0 = ptrba[6];
res0_6 += a0*b0;
res1_6 += a0*b1;
a1 = ptrba[7];
res0_7 += a1*b0;
res1_7 += a1*b1;
ptrba = ptrba+8;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res0_4 *= alpha;
res0_5 *= alpha;
res0_6 *= alpha;
res0_7 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
res1_4 *= alpha;
res1_5 *= alpha;
res1_6 *= alpha;
res1_7 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C0[4] = res0_4;
C0[5] = res0_5;
C0[6] = res0_6;
C0[7] = res0_7;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
C1[4] = res1_4;
C1[5] = res1_5;
C1[6] = res1_6;
C1[7] = res1_7;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*8;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 8; // number of values in A
#endif
C0 = C0+8;
C1 = C1+8;
}
if ( bm & 4 ) // 4-row remainder in M
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res1_0 = 0;
res1_1 = 0;
res1_2 = 0;
res1_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
a0 = ptrba[2];
res0_2 += a0*b0;
res1_2 += a0*b1;
a1 = ptrba[3];
res0_3 += a1*b0;
res1_3 += a1*b1;
ptrba = ptrba+4;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
res1_2 *= alpha;
res1_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C1[0] = res1_0;
C1[1] = res1_1;
C1[2] = res1_2;
C1[3] = res1_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
C1 = C1+4;
}
if ( bm & 2 ) // 2-row remainder in M
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res0_1 = 0;
res1_0 = 0;
res1_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
a1 = ptrba[1];
res0_1 += a1*b0;
res1_1 += a1*b1;
ptrba = ptrba+2;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res0_1 *= alpha;
res1_0 *= alpha;
res1_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C1[0] = res1_0;
C1[1] = res1_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
C1 = C1+2;
}
if ( bm & 1 ) // single remaining row in M
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*2;
#endif
res0_0 = 0;
res1_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+2; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
b1 = ptrbb[1];
a0 = ptrba[0];
res0_0 += a0*b0;
res1_0 += a0*b1;
ptrba = ptrba+1;
ptrbb = ptrbb+2;
}
res0_0 *= alpha;
res1_0 *= alpha;
C0[0] = res0_0;
C1[0] = res1_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 2; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*2;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
C1 = C1+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2; // two columns of B finished; advance the TRMM offset
#endif
k = (bk<<1);
bb = bb+k;
i = (ldc<<1);
C = C+i;
}
for (j=0; j<(bn&1); j+=1) // remaining single column of B
{
C0 = C;
#if defined(TRMMKERNEL) && defined(LEFT)
off = offset;
#endif
ptrba = ba;
for (i=0; i<bm/8; i+=1)
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*8;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
res0_4 = 0;
res0_5 = 0;
res0_6 = 0;
res0_7 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+8; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
a0 = ptrba[4];
res0_4 += a0*b0;
a1 = ptrba[5];
res0_5 += a1*b0;
a0 = ptrba[6];
res0_6 += a0*b0;
a1 = ptrba[7];
res0_7 += a1*b0;
ptrba = ptrba+8;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
res0_4 *= alpha;
res0_5 *= alpha;
res0_6 *= alpha;
res0_7 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
C0[4] = res0_4;
C0[5] = res0_5;
C0[6] = res0_6;
C0[7] = res0_7;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 8; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*8;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 8; // number of values in A
#endif
C0 = C0+8;
}
if ( bm & 4 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*4;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
res0_2 = 0;
res0_3 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+4; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
a0 = ptrba[2];
res0_2 += a0*b0;
a1 = ptrba[3];
res0_3 += a1*b0;
ptrba = ptrba+4;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
res0_2 *= alpha;
res0_3 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
C0[2] = res0_2;
C0[3] = res0_3;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 4; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*4;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 4; // number of values in A
#endif
C0 = C0+4;
}
if ( bm & 2 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*2;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
res0_1 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+2; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
a1 = ptrba[1];
res0_1 += a1*b0;
ptrba = ptrba+2;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
res0_1 *= alpha;
C0[0] = res0_0;
C0[1] = res0_1;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 2; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*2;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 2; // number of values in A
#endif
C0 = C0+2;
}
if ( bm & 1 )
{
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
ptrbb = bb;
#else
ptrba += off*1;
ptrbb = bb + off*1;
#endif
res0_0 = 0;
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk-off;
#elif defined(LEFT)
temp = off+1; // number of values in A
#else
temp = off+1; // number of values in B
#endif
for (k=0; k<temp; k++)
{
b0 = ptrbb[0];
a0 = ptrba[0];
res0_0 += a0*b0;
ptrba = ptrba+1;
ptrbb = ptrbb+1;
}
res0_0 *= alpha;
C0[0] = res0_0;
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
temp = bk - off;
#ifdef LEFT
temp -= 1; // number of values in A
#else
temp -= 1; // number of values in B
#endif
ptrba += temp*1;
ptrbb += temp*1;
#endif
#ifdef LEFT
off += 1; // number of values in A
#endif
C0 = C0+1;
}
#if defined(TRMMKERNEL) && !defined(LEFT)
off += 1;
#endif
k = (bk<<0);
bb = bb+k;
C = C+ldc;
}
return 0;
}
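The off/temp bookkeeping in the kernel above is what turns the blocked multiply into a triangular one: off is the running offset of the diagonal (it starts at offset for the LEFT case and at -offset otherwise), and temp is the number of k-iterations that actually contribute to the current register tile. Condensed from the #if blocks above, with M standing for the row-tile size (8, 4, 2 or 1) and N for the column-tile size (2 or 1); M and N are placeholders, not macros from the file:

/* condensed sketch of the temp selection used for every tile above */
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
temp = bk - off;   /* the first off k-values were skipped via ptrba/ptrbb; use the rest */
#elif defined(LEFT)
temp = off + M;    /* start of the panel; "number of values in A" */
#else
temp = off + N;    /* start of the panel; "number of values in B" */
#endif

After a tile is finished, off grows by M when LEFT is defined, and by N (here 2 or 1) once a full block of columns is done in the right-hand-side case, so each successive tile sees the correspondingly adjusted stretch of K.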


@@ -53,8 +53,8 @@ STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_LT = dtrsm_kernel_LT_8x2_bulldozer.S
+DTRSMKERNEL_RN = dtrsm_kernel_RN_8x2_bulldozer.S
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
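The _LN/_LT/_RN/_RT suffixes encode the side and transposition of the triangular factor (Left/Right, Not-transposed/Transposed). Purely as a schematic reference for one of the four cases, and not the packed kernel interface used here: a dense, unblocked left-side solve, assuming a lower-triangular, non-unit factor, looks like this:

/* Schematic only: solve L * X = alpha * B for X (X overwrites B).
 * L is m x m lower triangular with non-zero diagonal; everything is
 * column-major with leading dimensions ldl and ldb.  Assumed example. */
static void trsm_left_lower_ref(int m, int n, double alpha,
                                const double *L, int ldl,
                                double *B, int ldb)
{
    for (int j = 0; j < n; j++) {
        for (int i = 0; i < m; i++) {          /* forward substitution */
            double s = alpha * B[i + j * ldb];
            for (int k = 0; k < i; k++)
                s -= L[i + k * ldl] * B[k + j * ldb];
            B[i + j * ldb] = s / L[i + i * ldl];
        }
    }
}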


@@ -1255,6 +1255,9 @@
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N
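KK appears to play the same role here as the off counter in the generic C kernel above: when the triangular operand is on the right-hand side, the running offset has to grow by the column-blocking factor once a pair of columns of N has been completed. The added instructions are the assembly counterpart of this fragment of the C kernel:

#if defined(TRMMKERNEL) && !defined(LEFT)
off += 2;   /* two columns of B finished for this block */
#endif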

File diff suppressed because it is too large.


@@ -40,7 +40,6 @@
#include "common.h"
#include "l2param.h"
// #undef ALIGNED_ACCESS
#define A_PRE 256
@@ -111,11 +110,7 @@
#define Y1 %rbp
#define X1 %r15
#ifdef ALIGNED_ACCESS
#define MM INCX
#else
#define MM M
#endif
#define ALPHA %xmm15
@@ -216,23 +211,6 @@
movq BUFFER, X1
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L01
vmovsd (X), %xmm0
addq INCX, X
vmovsd %xmm0, 1 * SIZE(BUFFER)
addq $1 * SIZE, BUFFER
addq $2 * SIZE, X1
decq M
jle .L10
ALIGN_4
.L01:
#endif
movq M, I
sarq $3, I
jle .L05
@@ -287,10 +265,6 @@
.L10:
movq Y, Y1
#ifdef ALIGNED_ACCESS
testq $SIZE, LDA
jne .L50
#endif
#if GEMV_UNROLL >= 8
cmpq $8, N
@@ -316,41 +290,6 @@
vxorps %xmm7 , %xmm7, %xmm7
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L1X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A1, LDA), %xmm9
vmovsd -16 * SIZE(A1, LDA, 2), %xmm10
vmovsd -16 * SIZE(A1, LDA3), %xmm11
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
vmovsd -16 * SIZE(A2), %xmm8
vmovsd -16 * SIZE(A2, LDA), %xmm9
vmovsd -16 * SIZE(A2, LDA, 2), %xmm10
vmovsd -16 * SIZE(A2, LDA3), %xmm11
vfmaddpd %xmm4, %xmm8 , %xmm12, %xmm4
vfmaddpd %xmm5, %xmm9 , %xmm12, %xmm5
vfmaddpd %xmm6, %xmm10, %xmm12, %xmm6
vfmaddpd %xmm7, %xmm11, %xmm12, %xmm7
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L1X:
#endif
movq M, I
sarq $3, I
jle .L15
@@ -671,31 +610,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L2X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A1, LDA), %xmm9
vmovsd -16 * SIZE(A2), %xmm10
vmovsd -16 * SIZE(A2, LDA), %xmm11
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm10, %xmm12, %xmm2
vfmaddpd %xmm3, %xmm11, %xmm12, %xmm3
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L2X:
#endif
movq M, I
sarq $3, I
jle .L25
@@ -924,26 +838,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L3X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm8
vmovsd -16 * SIZE(A2), %xmm9
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm9 , %xmm12, %xmm1
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L3X:
#endif
movq M, I
sarq $3, I
jle .L35
@@ -1100,21 +994,6 @@
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L4X
movsd -16 * SIZE(X1), %xmm12
movsd -16 * SIZE(A1), %xmm8
vfmaddpd %xmm0, %xmm8 , %xmm12, %xmm0
addq $SIZE, A1
addq $SIZE, X1
ALIGN_3
.L4X:
#endif
movq M, I
sarq $3, I
@@ -1223,683 +1102,6 @@
vmovlpd %xmm0, (Y1)
addq INCY, Y1
#ifdef ALIGNED_ACCESS
jmp .L999
ALIGN_4
.L50:
#if GEMV_UNROLL >= 4
cmpq $4, N
jl .L60
ALIGN_3
.L51:
subq $4, N
leaq 16 * SIZE(BUFFER), X1
movq A, A1
leaq (A1, LDA, 2), A2
leaq (A1, LDA, 4), A
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L5X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A1, LDA), %xmm5
vmovsd -16 * SIZE(A2), %xmm6
vmovsd -16 * SIZE(A2, LDA), %xmm7
vfmaddpd %xmm0, %xmm4 , %xmm12, %xmm0
vfmaddpd %xmm1, %xmm5 , %xmm12, %xmm1
vfmaddpd %xmm2, %xmm6 , %xmm12, %xmm2
vfmaddpd %xmm3, %xmm7 , %xmm12, %xmm3
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L5X:
#endif
vxorps %xmm8 , %xmm8, %xmm8
vxorps %xmm9 , %xmm9, %xmm9
vmovhpd -16 * SIZE(A1, LDA), %xmm8 , %xmm8
vmovhpd -16 * SIZE(A2, LDA), %xmm9 , %xmm9
movq M, I
sarq $3, I
jle .L55
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L53
ALIGN_4
.L52:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
VMOVUPS_A1(-12 * SIZE, A1, %xmm4)
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
decq I
jg .L52
ALIGN_4
.L53:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-11 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A2(-11 * SIZE, A2, LDA, 1, %xmm7)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2( -9 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -12 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A2( -9 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_A2(-7 * SIZE, A1, LDA, 1, %xmm5)
vfmaddpd %xmm2 , -10 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A2(-7 * SIZE, A2, LDA, 1, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
ALIGN_4
.L55:
testq $4, M
jle .L56
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_A2(-13 * SIZE, A1, LDA, 1, %xmm8)
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
VMOVUPS_A2(-13 * SIZE, A2, LDA, 1, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
vfmaddpd %xmm2 , -14 * SIZE(A2) , %xmm13 , %xmm2
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm3 , %xmm7 , %xmm13 , %xmm3
addq $4 * SIZE, A1
addq $4 * SIZE, A2
addq $4 * SIZE, X1
ALIGN_4
.L56:
testq $2, M
jle .L57
VMOVUPS_A2(-15 * SIZE, A1, LDA, 1, %xmm5)
VMOVUPS_A2(-15 * SIZE, A2, LDA, 1, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm2 , -16 * SIZE(A2) , %xmm12 , %xmm2
vshufpd $1, %xmm7, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
addq $2 * SIZE, A1
addq $2 * SIZE, A2
addq $2 * SIZE, X1
ALIGN_4
.L57:
testq $1, M
je .L58
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A2), %xmm6
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
vshufpd $1, %xmm8, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm2 , %xmm6 , %xmm12 , %xmm2
vshufpd $1, %xmm9, %xmm9 , %xmm9
vfmaddpd %xmm3 , %xmm9 , %xmm12 , %xmm3
ALIGN_4
.L58:
vhaddpd %xmm1, %xmm0 , %xmm0
vhaddpd %xmm3, %xmm2 , %xmm2
vmulpd ALPHA, %xmm0 , %xmm0
vmulpd ALPHA, %xmm2 , %xmm2
cmpq $SIZE, INCY
jne .L59
vmovups 0 * SIZE(Y), %xmm4
vmovups 2 * SIZE(Y), %xmm5
addq $4 * SIZE, Y
vaddpd %xmm4, %xmm0 , %xmm0
vaddpd %xmm5, %xmm2 , %xmm2
vmovups %xmm0, 0 * SIZE(Y1)
vmovups %xmm2, 2 * SIZE(Y1)
addq $4 * SIZE, Y1
cmpq $4, N
jge .L51
jmp .L60
ALIGN_4
.L59:
vmovsd (Y), %xmm4
addq INCY, Y
vmovhpd (Y), %xmm4 , %xmm4
addq INCY, Y
vmovsd (Y), %xmm5
addq INCY, Y
vmovhpd (Y), %xmm5 , %xmm5
addq INCY, Y
vaddpd %xmm4, %xmm0 , %xmm0
vaddpd %xmm5, %xmm2 , %xmm2
vmovlpd %xmm0, (Y1)
addq INCY, Y1
vmovhpd %xmm0, (Y1)
addq INCY, Y1
vmovlpd %xmm2, (Y1)
addq INCY, Y1
vmovhpd %xmm2, (Y1)
addq INCY, Y1
cmpq $4, N
jge .L51
ALIGN_4
.L60:
#endif
#if GEMV_UNROLL >= 2
cmpq $2, N
jl .L70
#if GEMV_UNROLL == 2
ALIGN_3
.L61:
#endif
subq $2, N
leaq 16 * SIZE(BUFFER), X1
movq A, A1
leaq (A1, LDA), A2
leaq (A1, LDA, 2), A
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L6X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vmovsd -16 * SIZE(A2), %xmm5
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
vfmaddpd %xmm1 , %xmm5 , %xmm12 , %xmm1
addq $SIZE, A1
addq $SIZE, A2
addq $SIZE, X1
ALIGN_3
.L6X:
#endif
vxorps %xmm8 , %xmm8, %xmm8
vmovhpd -16 * SIZE(A2), %xmm8 , %xmm8
movq M, I
sarq $3, I
jle .L65
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_A1(-13 * SIZE, A2, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L63
ALIGN_4
.L62:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A1(-11 * SIZE, A2, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1
VMOVUPS_XL1(-8 * SIZE, X1, %xmm12)
VMOVUPS_A1(-7 * SIZE, A2, %xmm5)
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm9 , %xmm9
vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1
VMOVUPS_XL1(-6 * SIZE, X1, %xmm13)
VMOVUPS_A1(-5 * SIZE, A2, %xmm7)
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
decq I
jg .L62
ALIGN_4
.L63:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_A1(-11 * SIZE, A2, %xmm9)
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
VMOVUPS_A1( -9 * SIZE, A2, %xmm8)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm9, %xmm7 , %xmm7
vfmaddpd %xmm1 , %xmm7 , %xmm12 , %xmm1
vfmaddpd %xmm0 , -10 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm8, %xmm9 , %xmm9
vfmaddpd %xmm1 , %xmm9 , %xmm13 , %xmm1
addq $8 * SIZE, A1
addq $8 * SIZE, A2
addq $8 * SIZE, X1
ALIGN_4
.L65:
testq $4, M
jle .L66
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_A1(-13 * SIZE, A2, %xmm7)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
vfmaddpd %xmm0 , -14 * SIZE(A1) , %xmm13 , %xmm0
vshufpd $1, %xmm7, %xmm5 , %xmm5
vmovups %xmm7, %xmm8
vfmaddpd %xmm1 , %xmm5 , %xmm13 , %xmm1
addq $4 * SIZE, A1
addq $4 * SIZE, A2
addq $4 * SIZE, X1
ALIGN_4
.L66:
testq $2, M
jle .L67
VMOVUPS_A1(-15 * SIZE, A2, %xmm5)
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm5, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
movaps %xmm5, %xmm8
addq $2 * SIZE, A1
addq $2 * SIZE, A2
addq $2 * SIZE, X1
ALIGN_4
.L67:
testq $1, M
je .L68
vmovsd -16 * SIZE(X1), %xmm12
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vshufpd $1, %xmm8, %xmm8 , %xmm8
vfmaddpd %xmm1 , %xmm8 , %xmm12 , %xmm1
ALIGN_4
.L68:
vaddpd %xmm2, %xmm0 , %xmm0
vaddpd %xmm3, %xmm1 , %xmm1
vhaddpd %xmm1, %xmm0 , %xmm0
vmulpd ALPHA, %xmm0 , %xmm0
vmovsd (Y), %xmm4
addq INCY, Y
vmovhpd (Y), %xmm4 , %xmm4
addq INCY, Y
vaddpd %xmm4, %xmm0 , %xmm0
vmovlpd %xmm0, (Y1)
addq INCY, Y1
vmovhpd %xmm0, (Y1)
addq INCY, Y1
#if GEMV_UNROLL == 2
cmpq $2, N
jge .L61
#endif
ALIGN_4
.L70:
cmpq $1, N
jl .L999
#endif
leaq 16 * SIZE(BUFFER), X1
movq A, A1
vxorps %xmm0 , %xmm0, %xmm0
vxorps %xmm1 , %xmm1, %xmm1
vxorps %xmm2 , %xmm2, %xmm2
vxorps %xmm3 , %xmm3, %xmm3
#ifdef ALIGNED_ACCESS
testq $SIZE, A
je .L7X
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
addq $SIZE, A1
addq $SIZE, X1
ALIGN_3
.L7X:
#endif
movq M, I
sarq $3, I
jle .L75
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
decq I
jle .L73
ALIGN_4
.L72:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1( -8 * SIZE, X1, %xmm12)
VMOVUPS_XL1( -6 * SIZE, X1, %xmm13)
addq $8 * SIZE, A1
addq $8 * SIZE, X1
decq I
jg .L72
ALIGN_4
.L73:
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
VMOVUPS_XL1(-12 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-10 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -12 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -10 * SIZE(A1) , %xmm13 , %xmm2
addq $8 * SIZE, A1
addq $8 * SIZE, X1
ALIGN_4
.L75:
testq $4, M
jle .L76
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
VMOVUPS_XL1(-14 * SIZE, X1, %xmm13)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
vfmaddpd %xmm2 , -14 * SIZE(A1) , %xmm13 , %xmm2
addq $4 * SIZE, A1
addq $4 * SIZE, X1
ALIGN_4
.L76:
testq $2, M
jle .L77
VMOVUPS_XL1(-16 * SIZE, X1, %xmm12)
vfmaddpd %xmm0 , -16 * SIZE(A1) , %xmm12 , %xmm0
addq $2 * SIZE, A1
addq $2 * SIZE, X1
ALIGN_4
.L77:
testq $1, M
je .L78
vmovsd -16 * SIZE(X1), %xmm12
vmovsd -16 * SIZE(A1), %xmm4
vfmaddpd %xmm0 , %xmm4 , %xmm12 , %xmm0
ALIGN_4
.L78:
vaddpd %xmm2, %xmm0 , %xmm0
vaddpd %xmm3, %xmm1 , %xmm1
vaddpd %xmm1, %xmm0 , %xmm0
vhaddpd %xmm1, %xmm0 , %xmm0
vmulsd ALPHA, %xmm0 , %xmm0
vmovsd (Y), %xmm4
addq INCY, Y
vaddsd %xmm4, %xmm0 , %xmm0
vmovlpd %xmm0, (Y1)
addq INCY, Y1
#endif
ALIGN_4
.L999:
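The blocks removed in this file are the ALIGNED_ACCESS peeling and fix-up paths, including the separate .L50 code path selected by the alignment test on LDA; with the unaligned VMOVUPS_* loads and vfmaddpd forms used in the remaining code, MM can simply stay M and no alignment peeling appears to be needed. For orientation, a plain C reference of the transposed GEMV this kernel computes, y := alpha * A^T * x + y, with column-major A and unit strides assumed for clarity (illustrative only, not the kernel's calling convention):

/* Reference (not the kernel ABI): y := alpha * A^T * x + y,
 * A is m x n, column-major with leading dimension lda. */
static void dgemv_t_ref(int m, int n, double alpha,
                        const double *A, int lda,
                        const double *x, double *y)
{
    for (int j = 0; j < n; j++) {       /* one dot product per column of A */
        double s = 0.0;
        for (int i = 0; i < m; i++)
            s += A[i + j * lda] * x[i];
        y[j] += alpha * s;
    }
}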

File diff suppressed because it is too large.

File diff suppressed because it is too large.

File diff suppressed because it is too large.


@@ -949,6 +949,9 @@
.L2_60:
#if defined(TRMMKERNEL) && !defined(LEFT)
addq $2, KK
#endif
decq J // j --
jg .L2_01 // next 2 lines of N