diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3
index fdbae2daa..4ef351de3 100644
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@@ -32,6 +32,10 @@ ifeq ($(TARGET), GENERIC)
 USE_TRMM = 1
 endif
 
+ifeq ($(CORE), HASWELL)
+USE_TRMM = 1
+endif
+
 
 
 SKERNELOBJS	+= \
diff --git a/kernel/generic/trmmkernel_4x8.c b/kernel/generic/trmmkernel_4x8.c
new file mode 100644
index 000000000..09c47f147
--- /dev/null
+++ b/kernel/generic/trmmkernel_4x8.c
@@ -0,0 +1,1402 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+   // TRMM micro-kernel: each tile of C is overwritten with alpha * sum over k of (values read via ba) * (values read via bb); `offset` selects which k steps a tile uses. Always returns 0.
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+   // resJ_I accumulates element I of column J of the current tile (it is later stored to CJ[I]).
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+
+   FLOAT res4_0;
+   FLOAT res4_1;
+   FLOAT res4_2;
+   FLOAT res4_3;
+
+   FLOAT res5_0;
+   FLOAT res5_1;
+   FLOAT res5_2;
+   FLOAT res5_3;
+
+   FLOAT res6_0;
+   FLOAT res6_1;
+   FLOAT res6_2;
+   FLOAT res6_3;
+
+   FLOAT res7_0;
+   FLOAT res7_1;
+   FLOAT res7_2;
+   FLOAT res7_3;
+   // a0/a1 hold values read through ptrba; b0..b7 hold values read through ptrbb.
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   FLOAT b4;
+   FLOAT b5;
+   FLOAT b6;
+   FLOAT b7;
+   // off: running k offset derived from `offset`; temp: number of k steps to run, or to skip after the k loop.
+   BLASLONG off, temp;
+   // The compile-time variant macros (LEFT, TRANSA) are mirrored into run-time booleans below.
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+
+   backwards = left != transposed; // LEFT xor TRANSA: a tile skips its first `off` k steps instead of its last ones
+
+   if (!left) {
+      off = -offset;
+   }
+
+
+   for (j=0; j<bn/8; j+=1) // bn/8 blocks of 8 columns (Mx8)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+        C4 = C3+ldc;
+        C5 = C4+ldc;
+        C6 = C5+ldc;
+        C7 = C6+ldc;
+
+
+        if (left) {
+            off = offset; // restart the k offset for each column block
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x8
+	{
+
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // skip off k steps: 4 values of A per step
+		   ptrbb += off*8; // skip off k steps: 8 values of B per step
+                }
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+		res4_2 = 0;
+		res4_3 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+		res5_2 = 0;
+		res5_3 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+		res6_2 = 0;
+		res6_3 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+		res7_2 = 0;
+		res7_3 = 0;
+
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 8;  // number of values in B
+
+		for (k=0; k<temp; k++)
+                { // one k step: 4 values of A times 8 values of B
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+
+		// the 4x8 tile of C is overwritten, not accumulated into
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+
+		if (!backwards) { // step over the k steps this tile did not use
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 8;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+		    ptrbb += temp*8; // number of values in B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+		C4 = C4+4;
+		C5 = C5+4;
+		C6 = C6+4;
+		C7 = C7+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x8 loop
+	{
+		// The #if conditions in this and the following tiles are the compile-time form of the backwards test used above.
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+		C4 = C4+2;
+		C5 = C5+2;
+		C6 = C6+2;
+		C7 = C7+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x8 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+		res4_0 *= alpha;
+		res5_0 *= alpha;
+		res6_0 *= alpha;
+		res7_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+		C4[0] = res4_0;
+		C5[0] = res5_0;
+		C6[0] = res6_0;
+		C7[0] = res7_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+		C4 = C4+1;
+		C5 = C5+1;
+		C6 = C6+1;
+		C7 = C7+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 8; // right-side variant: advance the k offset by the 8 columns just processed
+#endif
+
+        k = (bk<<3); // 8*bk values of B belong to this column block
+        bb = bb+k;
+        i = (ldc<<3); // 8 columns of C
+        C = C+i;
+    }
+
+
+
+   for (j=0; j<(bn&4); j+=4) // at most one block of 4 columns (Mx4)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+	{
+
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // number of values in A
+		   ptrbb += off*4; // number of values in B
+                }
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+		    ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+
+
+
+   for (j=0; j<(bn&2); j+=2) // at most one block of 2 columns (Mx2)
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+		off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+
+   for (j=0; j<(bn&1); j+=1) // at most one final single column (Mx1)
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/kernel/x86_64/KERNEL.HASWELL b/kernel/x86_64/KERNEL.HASWELL
index d1f34cc7b..a4686debb 100644
--- a/kernel/x86_64/KERNEL.HASWELL
+++ b/kernel/x86_64/KERNEL.HASWELL
@@ -29,6 +29,7 @@ DAXPYKERNEL = daxpy.c
 CAXPYKERNEL = caxpy.c
 ZAXPYKERNEL = zaxpy.c
 
+STRMMKERNEL    =  sgemm_kernel_16x4_haswell.S
 SGEMMKERNEL    =  sgemm_kernel_16x4_haswell.S
 SGEMMINCOPY    =  ../generic/gemm_ncopy_16.c
 SGEMMITCOPY    =  ../generic/gemm_tcopy_16.c
@@ -39,16 +40,18 @@ SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
 SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
 SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
-DGEMMKERNEL    =  dgemm_kernel_4x4_haswell.S
-DGEMMINCOPY    =
-DGEMMITCOPY    =
-DGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
-DGEMMINCOPYOBJ =
-DGEMMITCOPYOBJ =
+DTRMMKERNEL    =  dtrmm_kernel_4x8_haswell.c
+DGEMMKERNEL    =  dgemm_kernel_4x8_haswell.S
+DGEMMINCOPY    =  ../generic/gemm_ncopy_4.c
+DGEMMITCOPY    =  ../generic/gemm_tcopy_4.c
+DGEMMONCOPY    =  ../generic/gemm_ncopy_8.c
+DGEMMOTCOPY    =  ../generic/gemm_tcopy_8.c
+DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
 DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
 DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+CTRMMKERNEL    =  cgemm_kernel_8x2_haswell.S
 CGEMMKERNEL    =  cgemm_kernel_8x2_haswell.S
 CGEMMINCOPY    =  ../generic/zgemm_ncopy_8.c
 CGEMMITCOPY    =  ../generic/zgemm_tcopy_8.c
@@ -59,6 +62,7 @@ CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+ZTRMMKERNEL    =  zgemm_kernel_4x2_haswell.S
 ZGEMMKERNEL    =  zgemm_kernel_4x2_haswell.S
 ZGEMMINCOPY    =  ../generic/zgemm_ncopy_4.c
 ZGEMMITCOPY    =  ../generic/zgemm_tcopy_4.c
diff --git a/kernel/x86_64/dgemm_kernel_4x8_haswell.S b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
new file mode 100644
index 000000000..c84b599ce
--- /dev/null
+++ b/kernel/x86_64/dgemm_kernel_4x8_haswell.S
@@ -0,0 +1,4753 @@
+/*********************************************************************************
+Copyright (c) 2015, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+
+#define ASSEMBLER
+#include "common.h"
+/* Register aliases. Several names share one register (OLD_M/AO/BO1 = %rdi, OLD_N/BO = %rsi, CO1/BO2 = %r15), so those aliases must not be live at the same time. */
+#define OLD_M	%rdi
+#define OLD_N	%rsi
+#define M	%r13
+#define J	%r14
+#define OLD_K	%rdx
+
+#define A	%rcx
+#define B	%r8
+#define C	%r9
+#define LDC	%r10
+	
+#define I	%r11
+#define AO	%rdi
+#define BO	%rsi
+#define	CO1	%r15
+#define K	%r12
+#define	SP	%rbx
+
+#define BO1	%rdi
+#define BO2	%r15
+#define BO3	%rbp
+/* Frame size and buffer size per ABI (L_BUFFER_SIZE is presumably the size of BUFFER1 - confirm); under WINDOWS_ABI the OLD_* names are slots addressed relative to %rsp. */
+#ifndef WINDOWS_ABI
+
+#define STACKSIZE 96
+#define L_BUFFER_SIZE 256*8*12+4096
+
+#else
+
+#define STACKSIZE 256
+#define L_BUFFER_SIZE 128*8*12+512
+
+#define OLD_A		40 + STACKSIZE(%rsp)
+#define OLD_B		48 + STACKSIZE(%rsp)
+#define OLD_C		56 + STACKSIZE(%rsp)
+#define OLD_LDC		64 + STACKSIZE(%rsp)
+#define OLD_OFFSET	72 + STACKSIZE(%rsp)
+
+#endif
+
+/* Frame-resident variables at fixed offsets from %rsp. */
+#define Ndiv12	 24(%rsp)
+#define Nmod12	 32(%rsp)
+#define N	 40(%rsp)
+#define ALPHA	 48(%rsp)
+#define OFFSET	 56(%rsp)
+#define KK	 64(%rsp)
+#define KKK	 72(%rsp)
+#define BUFFER1	           128(%rsp)
+/* STACK_TOUCH: on OS_WINDOWS it stores a zero at 4 KiB strides above %rsp, the number of stores chosen from L_BUFFER_SIZE; elsewhere it expands to nothing. Presumably this commits the stack pages before use - confirm. */
+#if defined(OS_WINDOWS)
+#if   L_BUFFER_SIZE > 16384
+#define STACK_TOUCH \
+        movl    $ 0,  4096 * 4(%rsp);\
+        movl    $ 0,  4096 * 3(%rsp);\
+        movl    $ 0,  4096 * 2(%rsp);\
+        movl    $ 0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 12288
+#define STACK_TOUCH \
+        movl    $ 0,  4096 * 3(%rsp);\
+        movl    $ 0,  4096 * 2(%rsp);\
+        movl    $ 0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 8192
+#define STACK_TOUCH \
+        movl    $ 0,  4096 * 2(%rsp);\
+        movl    $ 0,  4096 * 1(%rsp);
+#elif L_BUFFER_SIZE > 4096
+#define STACK_TOUCH \
+        movl    $ 0,  4096 * 1(%rsp);
+#else
+#define STACK_TOUCH
+#endif
+#else
+#define STACK_TOUCH
+#endif
+/* Byte offsets used by the prefetcht0 instructions in the kernel macros, relative to AO and BO. */
+#define	A_PR1	512
+#define	B_PR1	512
+
+/*******************************************************************************************
+* Macro definitions
+*******************************************************************************************/
+
+.macro INIT4x12
+/* Zero the twelve 256-bit accumulators ymm4-ymm15 of the 4x12 tile. */
+	vxorpd		%ymm4 , %ymm4 , %ymm4
+	vxorpd		%ymm5 , %ymm5 , %ymm5
+	vxorpd		%ymm6 , %ymm6 , %ymm6
+	vxorpd		%ymm7 , %ymm7 , %ymm7
+	vxorpd		%ymm8 , %ymm8 , %ymm8
+	vxorpd		%ymm9 , %ymm9 , %ymm9
+	vxorpd		%ymm10, %ymm10, %ymm10
+	vxorpd		%ymm11, %ymm11, %ymm11
+	vxorpd		%ymm12, %ymm12, %ymm12
+	vxorpd		%ymm13, %ymm13, %ymm13
+	vxorpd		%ymm14, %ymm14, %ymm14
+	vxorpd		%ymm15, %ymm15, %ymm15
+
+.endm
+
+.macro KERNEL4x12_I	/* first k-step of the pipelined 4x12 loop: vmulpd writes ymm4-ymm15 without reading their old contents */
+	prefetcht0	A_PR1(AO)
+	vmovups		-12 * SIZE(BO), %ymm1
+	prefetcht0	B_PR1(BO)
+	vmovups 	-16 * SIZE(AO), %ymm0
+	prefetcht0	B_PR1+64(BO)
+	vmovups		 -8 * SIZE(BO), %ymm2
+	prefetcht0	B_PR1+128(BO)
+	vmovups		 -4 * SIZE(BO), %ymm3
+	vmulpd  	%ymm0 ,%ymm1  , %ymm4
+	prefetcht0	B_PR1+192(BO)
+	vmulpd  	%ymm0 ,%ymm2  , %ymm8
+	vmulpd  	%ymm0 ,%ymm3  , %ymm12
+	prefetcht0	B_PR1+256(BO)
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm5
+	vmulpd  	%ymm0 ,%ymm2  , %ymm9
+	vmulpd  	%ymm0 ,%ymm3  , %ymm13
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm6
+	vmulpd  	%ymm0 ,%ymm2  , %ymm10
+/* BO advances by one 12-element step here, so the -12/-8/-4 loads below preload the B values of the next step; AO is left unchanged */
+	addq		$ 12*SIZE, BO
+	vmulpd  	%ymm0 ,%ymm3  , %ymm14
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmulpd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		 -8 * SIZE(BO), %ymm2
+	vmulpd  	%ymm0 ,%ymm3  , %ymm15
+	vmovups		 -4 * SIZE(BO), %ymm3
+
+.endm
+
+.macro KERNEL4x12_M1	/* pipelined k-step: A from -16*SIZE(AO), B taken from ymm1-ymm3 as left by the previous step */
+	prefetcht0	A_PR1(AO)
+	vmovups 	-16 * SIZE(AO), %ymm0
+	prefetcht0	B_PR1(BO)
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	prefetcht0	B_PR1+64(BO)
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	prefetcht0	B_PR1+128(BO)
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm12
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm13
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* each of ymm1-ymm3 is reloaded right after the last FMA that reads it; this macro advances neither AO nor BO */
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm14
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		 -8 * SIZE(BO), %ymm2
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm15
+	vmovups		 -4 * SIZE(BO), %ymm3
+
+.endm
+
+.macro KERNEL4x12_M2	/* pipelined k-step: A from -12*SIZE(AO), B taken from ymm1-ymm3 as left by the previous step */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm12
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm13
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* AO += 8 and BO += 24 elements, i.e. two 4-wide A steps and two 12-wide B steps; presumably this also covers a neighbouring M1 step, which moves no pointers */
+	addq		$ 8*SIZE, AO
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm14
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		  0 * SIZE(BO), %ymm1
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		  4 * SIZE(BO), %ymm2
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm15
+	vmovups		  8 * SIZE(BO), %ymm3
+	addq		$ 24*SIZE, BO
+.endm
+
+
+.macro KERNEL4x12_E	/* final pipelined k-step: same arithmetic as KERNEL4x12_M2 but ymm1-ymm3 are not reloaded */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm12
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm13
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* pointer updates on exit: AO += 8 elements, BO += 12 elements */
+	addq		$ 8*SIZE, AO
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm14
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm15
+	addq		$ 12*SIZE, BO
+.endm
+
+.macro KERNEL4x12_SUB	/* self-contained k-step: loads its own A and B values and accumulates into ymm4-ymm15 */
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vmovups		 -8 * SIZE(BO), %ymm2
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vmovups		 -4 * SIZE(BO), %ymm3
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm12
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	addq		$ 12*SIZE, BO
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm13
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+	addq		$ 4*SIZE, AO
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm14
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vfmadd231pd  	%ymm0 ,%ymm3  , %ymm15
+/* on exit AO has moved by 4 elements and BO by 12 elements */
+.endm
+
+
+.macro SAVE4x12
+/* Scale the 4x12 accumulators ymm4-ymm15 by ALPHA, reorder their lanes into columns and store them to twelve C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm6 , %ymm6
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+
+	vmulpd	%ymm0 , %ymm8 , %ymm8
+	vmulpd	%ymm0 , %ymm9 , %ymm9
+	vmulpd	%ymm0 , %ymm10, %ymm10
+	vmulpd	%ymm0 , %ymm11, %ymm11
+
+	vmulpd	%ymm0 , %ymm12, %ymm12
+	vmulpd	%ymm0 , %ymm13, %ymm13
+	vmulpd	%ymm0 , %ymm14, %ymm14
+	vmulpd	%ymm0 , %ymm15, %ymm15
+/* The kernel macros leave the products in rotated lane order; the permute/blend sequence below rebuilds four whole columns in ymm4-ymm7. */
+	vpermpd $ 0xb1 , %ymm5, %ymm5
+	vpermpd $ 0xb1 , %ymm7, %ymm7
+
+	vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
+	vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
+	vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
+	vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+/* 0x1b followed by 0xb1 swaps the two 128-bit halves of ymm2 and ymm3 */
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %ymm4, %ymm4
+	vaddpd 	           (CO1, LDC), %ymm5, %ymm5
+	vaddpd 	               (%rax), %ymm6, %ymm6
+	vaddpd 	          (%rax, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm6 ,  	(%rax)
+	vmovups	%ymm7 ,  	(%rax, LDC)
+
+	prefetcht0	32(CO1)
+	prefetcht0	32(CO1,LDC)
+	prefetcht0	32(%rax)
+	prefetcht0	32(%rax,LDC)
+
+	vpermpd $ 0xb1 , %ymm9 , %ymm9
+	vpermpd $ 0xb1 , %ymm11, %ymm11
+
+	vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
+	vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
+	vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
+	vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
+
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+/* rax = CO1 + 4*LDC and rbp = CO1 + 6*LDC: columns 4-7 */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %ymm4, %ymm4
+	vaddpd 	           (%rax, LDC), %ymm5, %ymm5
+	vaddpd 	                (%rbp), %ymm6, %ymm6
+	vaddpd 	           (%rbp, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(%rax)
+	vmovups	%ymm5 ,  	(%rax, LDC)
+	vmovups	%ymm6 ,  	(%rbp)
+	vmovups	%ymm7 ,  	(%rbp, LDC)
+
+	prefetcht0	32(%rax)
+	prefetcht0	32(%rax,LDC)
+	prefetcht0	32(%rbp)
+	prefetcht0	32(%rbp,LDC)
+
+	vpermpd $ 0xb1 , %ymm13, %ymm13
+	vpermpd $ 0xb1 , %ymm15, %ymm15
+
+	vblendpd $ 0x0a, %ymm13, %ymm12, %ymm0
+	vblendpd $ 0x05, %ymm13, %ymm12, %ymm1
+	vblendpd $ 0x0a, %ymm15, %ymm14, %ymm2
+	vblendpd $ 0x05, %ymm15, %ymm14, %ymm3
+
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+/* rax = CO1 + 8*LDC and rbp = CO1 + 10*LDC: columns 8-11 */
+	leaq	(%rax, LDC, 4), %rax
+	leaq	(%rbp, LDC, 4), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %ymm4, %ymm4
+	vaddpd 	           (%rax, LDC), %ymm5, %ymm5
+	vaddpd 	                (%rbp), %ymm6, %ymm6
+	vaddpd 	           (%rbp, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(%rax)
+	vmovups	%ymm5 ,  	(%rax, LDC)
+	vmovups	%ymm6 ,  	(%rbp)
+	vmovups	%ymm7 ,  	(%rbp, LDC)
+
+	prefetcht0	32(%rax)
+	prefetcht0	32(%rax,LDC)
+	prefetcht0	32(%rbp)
+	prefetcht0	32(%rbp,LDC)
+/* step CO1 past the four elements just written in each column */
+	addq	$ 4*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+
+.macro INIT2x12
+/* Zero the twelve 128-bit accumulators xmm4-xmm15 of the 2x12 tile. */
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+	vxorpd		%xmm8 , %xmm8 , %xmm8
+	vxorpd		%xmm9 , %xmm9 , %xmm9
+	vxorpd		%xmm10, %xmm10, %xmm10
+	vxorpd		%xmm11, %xmm11, %xmm11
+	vxorpd		%xmm12, %xmm12, %xmm12
+	vxorpd		%xmm13, %xmm13, %xmm13
+	vxorpd		%xmm14, %xmm14, %xmm14
+	vxorpd		%xmm15, %xmm15, %xmm15
+
+.endm
+
+.macro KERNEL2x12_SUB	/* one k-step of the 2x12 tile: two A values in xmm0 times twelve broadcast B values, accumulated into xmm4-xmm15 */
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vmovddup	-12 * SIZE(BO), %xmm1
+	vmovddup	-11 * SIZE(BO), %xmm2
+	vmovddup	-10 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm4
+	vmovddup	 -9 * SIZE(BO), %xmm1
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm5
+	vmovddup	 -8 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm6
+	vmovddup	 -7 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm7
+	vmovddup	 -6 * SIZE(BO), %xmm1
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm8
+	vmovddup	 -5 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm9
+	vmovddup	 -4 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm10
+	vmovddup	 -3 * SIZE(BO), %xmm1
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm11
+	vmovddup	 -2 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm12
+	vmovddup	 -1 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm13
+	addq		$ 12*SIZE, BO
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm14
+	addq		$ 2*SIZE, AO
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm15
+/* xmm1-xmm3 are rotated as B temporaries; on exit AO has moved by 2 elements and BO by 12 */
+.endm
+
+.macro SAVE2x12
+/* Scale the 2x12 accumulators xmm4-xmm15 by ALPHA and store two elements to each of twelve C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+	vmulpd	%xmm0 , %xmm5 , %xmm5
+	vmulpd	%xmm0 , %xmm6 , %xmm6
+	vmulpd	%xmm0 , %xmm7 , %xmm7
+
+	vmulpd	%xmm0 , %xmm8 , %xmm8
+	vmulpd	%xmm0 , %xmm9 , %xmm9
+	vmulpd	%xmm0 , %xmm10, %xmm10
+	vmulpd	%xmm0 , %xmm11, %xmm11
+
+	vmulpd	%xmm0 , %xmm12, %xmm12
+	vmulpd	%xmm0 , %xmm13, %xmm13
+	vmulpd	%xmm0 , %xmm14, %xmm14
+	vmulpd	%xmm0 , %xmm15, %xmm15
+
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %xmm4, %xmm4
+	vaddpd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddpd 	               (%rax), %xmm6, %xmm6
+	vaddpd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm5 ,  	(CO1, LDC)
+	vmovups	%xmm6 ,  	(%rax)
+	vmovups	%xmm7 ,  	(%rax, LDC)
+
+/* Columns 4-7 (rax = CO1 + 4*LDC, rbp = CO1 + 6*LDC) are added and stored in place from xmm8-xmm11, so the stored data does not depend on the conditional add having run. */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %xmm8 , %xmm8
+	vaddpd 	           (%rax, LDC), %xmm9 , %xmm9
+	vaddpd 	                (%rbp), %xmm10, %xmm10
+	vaddpd 	           (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovups	%xmm8 ,  	(%rax)
+	vmovups	%xmm9 ,  	(%rax, LDC)
+	vmovups	%xmm10,  	(%rbp)
+	vmovups	%xmm11,  	(%rbp, LDC)
+
+/* Columns 8-11 (rax = CO1 + 8*LDC, rbp = CO1 + 10*LDC) likewise come straight from xmm12-xmm15. */
+	leaq	(%rax, LDC, 4), %rax
+	leaq	(%rbp, LDC, 4), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %xmm12, %xmm12
+	vaddpd 	           (%rax, LDC), %xmm13, %xmm13
+	vaddpd 	                (%rbp), %xmm14, %xmm14
+	vaddpd 	           (%rbp, LDC), %xmm15, %xmm15
+
+#endif
+
+	vmovups	%xmm12,  	(%rax)
+	vmovups	%xmm13,  	(%rax, LDC)
+	vmovups	%xmm14,  	(%rbp)
+	vmovups	%xmm15,  	(%rbp, LDC)
+
+	addq	$ 2*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+
+.macro INIT1x12
+/* Zero xmm4-xmm15, the twelve accumulators of the 1x12 tile. */
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+	vxorpd		%xmm8 , %xmm8 , %xmm8
+	vxorpd		%xmm9 , %xmm9 , %xmm9
+	vxorpd		%xmm10, %xmm10, %xmm10
+	vxorpd		%xmm11, %xmm11, %xmm11
+	vxorpd		%xmm12, %xmm12, %xmm12
+	vxorpd		%xmm13, %xmm13, %xmm13
+	vxorpd		%xmm14, %xmm14, %xmm14
+	vxorpd		%xmm15, %xmm15, %xmm15
+
+.endm
+
+.macro KERNEL1x12_SUB	/* one k-step of the 1x12 tile: one A scalar in xmm0 times twelve B scalars, accumulated into the low lanes of xmm4-xmm15 */
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd	-11 * SIZE(BO), %xmm2
+	vmovsd	-10 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm4
+	vmovsd	 -9 * SIZE(BO), %xmm1
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm5
+	vmovsd	 -8 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm6
+	vmovsd	 -7 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm7
+	vmovsd	 -6 * SIZE(BO), %xmm1
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm8
+	vmovsd	 -5 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm9
+	vmovsd	 -4 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm10
+	vmovsd	 -3 * SIZE(BO), %xmm1
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm11
+	vmovsd	 -2 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm12
+	vmovsd	 -1 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm13
+	addq		$ 12*SIZE, BO
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm14
+	addq		$ 1*SIZE, AO
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm15
+/* xmm1-xmm3 are rotated as B temporaries; on exit AO has moved by 1 element and BO by 12 */
+.endm
+
+.macro SAVE1x12
+/* Scale the 1x12 accumulators (low lanes of xmm4-xmm15) by ALPHA and store one element to each of twelve C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm6 , %xmm6
+	vmulsd	%xmm0 , %xmm7 , %xmm7
+
+	vmulsd	%xmm0 , %xmm8 , %xmm8
+	vmulsd	%xmm0 , %xmm9 , %xmm9
+	vmulsd	%xmm0 , %xmm10, %xmm10
+	vmulsd	%xmm0 , %xmm11, %xmm11
+
+	vmulsd	%xmm0 , %xmm12, %xmm12
+	vmulsd	%xmm0 , %xmm13, %xmm13
+	vmulsd	%xmm0 , %xmm14, %xmm14
+	vmulsd	%xmm0 , %xmm15, %xmm15
+
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (CO1), %xmm4, %xmm4
+	vaddsd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddsd 	               (%rax), %xmm6, %xmm6
+	vaddsd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm6 ,  	(%rax)
+	vmovsd	%xmm7 ,  	(%rax, LDC)
+
+/* Columns 4-7 (rax = CO1 + 4*LDC, rbp = CO1 + 6*LDC) are added and stored in place from xmm8-xmm11, so the stored data does not depend on the conditional add having run. */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (%rax), %xmm8 , %xmm8
+	vaddsd 	           (%rax, LDC), %xmm9 , %xmm9
+	vaddsd 	                (%rbp), %xmm10, %xmm10
+	vaddsd 	           (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovsd	%xmm8 ,  	(%rax)
+	vmovsd	%xmm9 ,  	(%rax, LDC)
+	vmovsd	%xmm10,  	(%rbp)
+	vmovsd	%xmm11,  	(%rbp, LDC)
+
+/* Columns 8-11 (rax = CO1 + 8*LDC, rbp = CO1 + 10*LDC) likewise come straight from xmm12-xmm15. */
+	leaq	(%rax, LDC, 4), %rax
+	leaq	(%rbp, LDC, 4), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (%rax), %xmm12, %xmm12
+	vaddsd 	           (%rax, LDC), %xmm13, %xmm13
+	vaddsd 	                (%rbp), %xmm14, %xmm14
+	vaddsd 	           (%rbp, LDC), %xmm15, %xmm15
+
+#endif
+
+	vmovsd	%xmm12,  	(%rax)
+	vmovsd	%xmm13,  	(%rax, LDC)
+	vmovsd	%xmm14,  	(%rbp)
+	vmovsd	%xmm15,  	(%rbp, LDC)
+
+	addq	$ 1*SIZE, CO1
+.endm
+
+
+
+
+/******************************************************************************************/
+
+
+.macro INIT4x8
+/* Zero the eight 256-bit accumulators ymm4-ymm11 of the 4x8 tile. */
+	vxorpd		%ymm4 , %ymm4 , %ymm4
+	vxorpd		%ymm5 , %ymm5 , %ymm5
+	vxorpd		%ymm6 , %ymm6 , %ymm6
+	vxorpd		%ymm7 , %ymm7 , %ymm7
+	vxorpd		%ymm8 , %ymm8 , %ymm8
+	vxorpd		%ymm9 , %ymm9 , %ymm9
+	vxorpd		%ymm10, %ymm10, %ymm10
+	vxorpd		%ymm11, %ymm11, %ymm11
+
+.endm
+
+.macro KERNEL4x8_I	/* first k-step of the pipelined 4x8 loop: vmulpd writes ymm4-ymm11 without reading their old contents */
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vmovups		 -8 * SIZE(BO), %ymm2
+	vmulpd  	%ymm0 ,%ymm1  , %ymm4
+	vmulpd  	%ymm0 ,%ymm2  , %ymm8
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm5
+	vmulpd  	%ymm0 ,%ymm2  , %ymm9
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm6
+	vmulpd  	%ymm0 ,%ymm2  , %ymm10
+/* BO advances by one 8-element step here, so the -12/-8 loads below preload the B values of the next step; AO is left unchanged */
+	addq		$  8*SIZE, BO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmulpd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		 -8 * SIZE(BO), %ymm2
+
+.endm
+
+.macro KERNEL4x8_M1	/* pipelined k-step: A from -16*SIZE(AO), B taken from ymm1-ymm2 as left by the previous step */
+	prefetcht0	A_PR1(AO)
+	vmovups 	-16 * SIZE(AO), %ymm0
+	prefetcht0	B_PR1(BO)
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	prefetcht0	B_PR1+64(BO)
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* ymm1 and ymm2 are each reloaded right after the last FMA that reads them; this macro advances neither AO nor BO */
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		 -8 * SIZE(BO), %ymm2
+
+.endm
+
+.macro KERNEL4x8_M2	/* pipelined k-step: A from -12*SIZE(AO), B taken from ymm1-ymm2 as left by the previous step */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* AO += 8 and BO += 16 elements, i.e. two 4-wide A steps and two 8-wide B steps; presumably this also covers a neighbouring M1 step, which moves no pointers */
+	addq		$ 8*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		 -4 * SIZE(BO), %ymm1
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	vmovups		  0 * SIZE(BO), %ymm2
+	addq		$ 16*SIZE, BO
+.endm
+
+
+.macro KERNEL4x8_E	/* final pipelined k-step: same arithmetic as KERNEL4x8_M2 but ymm1-ymm2 are not reloaded */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+/* pointer updates on exit: AO += 8 elements, BO += 8 elements */
+	addq		$ 8*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+	addq		$  8*SIZE, BO
+.endm
+
+.macro KERNEL4x8_SUB	/* self-contained k-step: loads its own A and B values and accumulates into ymm4-ymm11 */
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vmovups		 -8 * SIZE(BO), %ymm2
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm8
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm9
+	addq		$  8*SIZE, BO
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm10
+	addq		$ 4*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm11
+/* on exit AO has moved by 4 elements and BO by 8 elements */
+.endm
+
+
+.macro SAVE4x8
+/* Scale the 4x8 accumulators ymm4-ymm11 by ALPHA, reorder their lanes into columns and store them to eight C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm6 , %ymm6
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+
+	vmulpd	%ymm0 , %ymm8 , %ymm8
+	vmulpd	%ymm0 , %ymm9 , %ymm9
+	vmulpd	%ymm0 , %ymm10, %ymm10
+	vmulpd	%ymm0 , %ymm11, %ymm11
+/* The kernel macros leave the products in rotated lane order; the permute/blend sequence below rebuilds four whole columns in ymm4-ymm7. */
+	vpermpd $ 0xb1 , %ymm5, %ymm5
+	vpermpd $ 0xb1 , %ymm7, %ymm7
+
+	vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
+	vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
+	vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
+	vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+/* 0x1b followed by 0xb1 swaps the two 128-bit halves of ymm2 and ymm3 */
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %ymm4, %ymm4
+	vaddpd 	           (CO1, LDC), %ymm5, %ymm5
+	vaddpd 	               (%rax), %ymm6, %ymm6
+	vaddpd 	          (%rax, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm6 ,  	(%rax)
+	vmovups	%ymm7 ,  	(%rax, LDC)
+
+	prefetcht0	32(CO1)
+	prefetcht0	32(CO1,LDC)
+	prefetcht0	32(%rax)
+	prefetcht0	32(%rax,LDC)
+
+	vpermpd $ 0xb1 , %ymm9 , %ymm9
+	vpermpd $ 0xb1 , %ymm11, %ymm11
+
+	vblendpd $ 0x0a, %ymm9 , %ymm8 , %ymm0
+	vblendpd $ 0x05, %ymm9 , %ymm8 , %ymm1
+	vblendpd $ 0x0a, %ymm11, %ymm10, %ymm2
+	vblendpd $ 0x05, %ymm11, %ymm10, %ymm3
+
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+/* rax = CO1 + 4*LDC and rbp = CO1 + 6*LDC: columns 4-7 */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %ymm4, %ymm4
+	vaddpd 	           (%rax, LDC), %ymm5, %ymm5
+	vaddpd 	                (%rbp), %ymm6, %ymm6
+	vaddpd 	           (%rbp, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(%rax)
+	vmovups	%ymm5 ,  	(%rax, LDC)
+	vmovups	%ymm6 ,  	(%rbp)
+	vmovups	%ymm7 ,  	(%rbp, LDC)
+
+	prefetcht0	32(%rax)
+	prefetcht0	32(%rax,LDC)
+	prefetcht0	32(%rbp)
+	prefetcht0	32(%rbp,LDC)
+/* step CO1 past the four elements just written in each column */
+	addq	$ 4*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+
+.macro INIT2x8
+/* Zero the eight 128-bit accumulators xmm4-xmm11 of the 2x8 tile. */
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+	vxorpd		%xmm8 , %xmm8 , %xmm8
+	vxorpd		%xmm9 , %xmm9 , %xmm9
+	vxorpd		%xmm10, %xmm10, %xmm10
+	vxorpd		%xmm11, %xmm11, %xmm11
+
+.endm
+
+.macro KERNEL2x8_SUB	/* one k-step of the 2x8 tile: two A values in xmm0 times eight broadcast B values, accumulated into xmm4-xmm11 */
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vmovddup	-12 * SIZE(BO), %xmm1
+	vmovddup	-11 * SIZE(BO), %xmm2
+	vmovddup	-10 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm4
+	vmovddup	 -9 * SIZE(BO), %xmm1
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm5
+	vmovddup	 -8 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm6
+	vmovddup	 -7 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm7
+	vmovddup	 -6 * SIZE(BO), %xmm1
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm8
+	vmovddup	 -5 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm9
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm10
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm11
+	addq		$  8*SIZE, BO
+	addq		$ 2*SIZE, AO
+/* xmm1-xmm3 are rotated as B temporaries; on exit AO has moved by 2 elements and BO by 8 */
+.endm
+
+.macro SAVE2x8
+/* Scale the 2x8 accumulators xmm4-xmm11 by ALPHA and store two elements to each of eight C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+	vmulpd	%xmm0 , %xmm5 , %xmm5
+	vmulpd	%xmm0 , %xmm6 , %xmm6
+	vmulpd	%xmm0 , %xmm7 , %xmm7
+
+	vmulpd	%xmm0 , %xmm8 , %xmm8
+	vmulpd	%xmm0 , %xmm9 , %xmm9
+	vmulpd	%xmm0 , %xmm10, %xmm10
+	vmulpd	%xmm0 , %xmm11, %xmm11
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %xmm4, %xmm4
+	vaddpd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddpd 	               (%rax), %xmm6, %xmm6
+	vaddpd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm5 ,  	(CO1, LDC)
+	vmovups	%xmm6 ,  	(%rax)
+	vmovups	%xmm7 ,  	(%rax, LDC)
+
+/* Columns 4-7 (rax = CO1 + 4*LDC, rbp = CO1 + 6*LDC) are added and stored in place from xmm8-xmm11, so the stored data does not depend on the conditional add having run. */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (%rax), %xmm8 , %xmm8
+	vaddpd 	           (%rax, LDC), %xmm9 , %xmm9
+	vaddpd 	                (%rbp), %xmm10, %xmm10
+	vaddpd 	           (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovups	%xmm8 ,  	(%rax)
+	vmovups	%xmm9 ,  	(%rax, LDC)
+	vmovups	%xmm10,  	(%rbp)
+	vmovups	%xmm11,  	(%rbp, LDC)
+
+	addq	$ 2*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+
+.macro INIT1x8
+/* Zero xmm4-xmm11, the eight accumulators of the 1x8 tile. */
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+	vxorpd		%xmm8 , %xmm8 , %xmm8
+	vxorpd		%xmm9 , %xmm9 , %xmm9
+	vxorpd		%xmm10, %xmm10, %xmm10
+	vxorpd		%xmm11, %xmm11, %xmm11
+
+.endm
+
+.macro KERNEL1x8_SUB	/* one k-step of the 1x8 tile: one A scalar in xmm0 times eight B scalars, accumulated into the low lanes of xmm4-xmm11 */
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd	-11 * SIZE(BO), %xmm2
+	vmovsd	-10 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm4
+	vmovsd	 -9 * SIZE(BO), %xmm1
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm5
+	vmovsd	 -8 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm6
+	vmovsd	 -7 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm7
+	vmovsd	 -6 * SIZE(BO), %xmm1
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm8
+	vmovsd	 -5 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm9
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm10
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm11
+	addq		$  8*SIZE, BO
+	addq		$ 1*SIZE, AO
+/* xmm1-xmm3 are rotated as B temporaries; on exit AO has moved by 1 element and BO by 8 */
+.endm
+
+.macro SAVE1x8
+/* Scale the 1x8 accumulators (low lanes of xmm4-xmm11) by ALPHA and store one element to each of eight C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm6 , %xmm6
+	vmulsd	%xmm0 , %xmm7 , %xmm7
+
+	vmulsd	%xmm0 , %xmm8 , %xmm8
+	vmulsd	%xmm0 , %xmm9 , %xmm9
+	vmulsd	%xmm0 , %xmm10, %xmm10
+	vmulsd	%xmm0 , %xmm11, %xmm11
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (CO1), %xmm4, %xmm4
+	vaddsd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddsd 	               (%rax), %xmm6, %xmm6
+	vaddsd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm6 ,  	(%rax)
+	vmovsd	%xmm7 ,  	(%rax, LDC)
+
+/* Columns 4-7 (rax = CO1 + 4*LDC, rbp = CO1 + 6*LDC) are added and stored in place from xmm8-xmm11, so the stored data does not depend on the conditional add having run. */
+	leaq	(%rax, LDC, 2), %rax
+	leaq	(%rax, LDC, 2), %rbp
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (%rax), %xmm8 , %xmm8
+	vaddsd 	           (%rax, LDC), %xmm9 , %xmm9
+	vaddsd 	                (%rbp), %xmm10, %xmm10
+	vaddsd 	           (%rbp, LDC), %xmm11, %xmm11
+
+#endif
+
+	vmovsd	%xmm8 ,  	(%rax)
+	vmovsd	%xmm9 ,  	(%rax, LDC)
+	vmovsd	%xmm10,  	(%rbp)
+	vmovsd	%xmm11,  	(%rbp, LDC)
+
+	addq	$ 1*SIZE, CO1
+.endm
+
+
+
+
+
+/******************************************************************************************/
+
+.macro INIT4x4
+/* Zero the four 256-bit accumulators ymm4-ymm7 of the 4x4 tile. */
+	vxorpd		%ymm4 , %ymm4 , %ymm4
+	vxorpd		%ymm5 , %ymm5 , %ymm5
+	vxorpd		%ymm6 , %ymm6 , %ymm6
+	vxorpd		%ymm7 , %ymm7 , %ymm7
+
+.endm
+
+.macro KERNEL4x4_I	/* first k-step of the pipelined 4x4 loop: vmulpd writes ymm4-ymm7 without reading their old contents */
+	prefetcht0	A_PR1(AO)
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm4
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm5
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm6
+/* BO advances by one 4-element step here, so the -12 load below preloads the B values of the next step; AO is left unchanged */
+	addq		$ 4*SIZE, BO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vmulpd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+
+.endm
+
+.macro KERNEL4x4_M1	/* pipelined k-step: A from -16*SIZE(AO), B taken from ymm1 as left by the previous step */
+	prefetcht0	A_PR1(AO)
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+/* ymm1 is reloaded after its last use below; this macro advances neither AO nor BO */
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		-12 * SIZE(BO), %ymm1
+
+.endm
+
+.macro KERNEL4x4_M2	/* pipelined k-step: A from -12*SIZE(AO), B taken from ymm1 as left by the previous step */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+/* AO += 8 and BO += 8 elements, i.e. two 4-wide steps of each; presumably this also covers a neighbouring M1 step, which moves no pointers */
+	addq		$ 8*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	vmovups		 -8 * SIZE(BO), %ymm1
+	addq		$ 8*SIZE, BO
+.endm
+
+
+.macro KERNEL4x4_E	/* final pipelined k-step: same arithmetic as KERNEL4x4_M2 but ymm1 is not reloaded */
+	vmovups 	-12 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+/* pointer updates on exit: AO += 8 elements, BO += 4 elements */
+	addq		$ 8*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+	addq		$ 4*SIZE, BO
+.endm
+
+.macro KERNEL4x4_SUB	/* self-contained k-step: loads its own A and B values and accumulates into ymm4-ymm7 */
+	vmovups		-12 * SIZE(BO), %ymm1
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm4
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm5
+	addq		$ 4*SIZE, BO
+	vpermpd		$ 0x1b, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm6
+	addq		$ 4*SIZE, AO
+	vpermpd		$ 0xb1, %ymm0  , %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm1  , %ymm7
+/* on exit AO and BO have each moved by 4 elements */
+.endm
+
+.macro SAVE4x4
+/* Scale the 4x4 accumulators ymm4-ymm7 by ALPHA, reorder their lanes into columns and store them to four C columns starting at CO1; the old C values are added unless TRMMKERNEL is defined. */
+	vbroadcastsd	ALPHA, %ymm0
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+	vmulpd	%ymm0 , %ymm7 , %ymm7
+	vmulpd	%ymm0 , %ymm5 , %ymm5
+	vmulpd	%ymm0 , %ymm6 , %ymm6
+/* The kernel macros leave the products in rotated lane order; the permute/blend sequence below rebuilds four whole columns in ymm4-ymm7. */
+	vpermpd $ 0xb1 , %ymm5, %ymm5
+	vpermpd $ 0xb1 , %ymm7, %ymm7
+
+	vblendpd $ 0x0a, %ymm5, %ymm4, %ymm0
+	vblendpd $ 0x05, %ymm5, %ymm4, %ymm1
+	vblendpd $ 0x0a, %ymm7, %ymm6, %ymm2
+	vblendpd $ 0x05, %ymm7, %ymm6, %ymm3
+/* 0x1b followed by 0xb1 swaps the two 128-bit halves of ymm2 and ymm3 */
+	vpermpd $ 0x1b , %ymm2, %ymm2
+	vpermpd $ 0x1b , %ymm3, %ymm3
+	vpermpd $ 0xb1 , %ymm2, %ymm2
+	vpermpd $ 0xb1 , %ymm3, %ymm3
+
+	vblendpd $ 0x03, %ymm0, %ymm2 , %ymm4
+	vblendpd $ 0x03, %ymm1, %ymm3 , %ymm5
+	vblendpd $ 0x03, %ymm2, %ymm0 , %ymm6
+	vblendpd $ 0x03, %ymm3, %ymm1 , %ymm7
+
+        leaq    (CO1, LDC, 2), %rax     
+	
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %ymm4, %ymm4
+	vaddpd 	           (CO1, LDC), %ymm5, %ymm5
+	vaddpd 	               (%rax), %ymm6, %ymm6
+	vaddpd 	          (%rax, LDC), %ymm7, %ymm7
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+	vmovups	%ymm5 ,  	(CO1, LDC)
+	vmovups	%ymm6 ,  	(%rax)
+	vmovups	%ymm7 ,  	(%rax, LDC)
+/* step CO1 past the four elements just written in each column */
+	addq	$ 4*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT2x4: clear the four accumulators (xmm4..xmm7) used by KERNEL2x4_SUB
+// and SAVE2x4, one register per column of the 2x4 tile.
+.macro INIT2x4
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+// KERNEL2x4_SUB: single-k step of the 2x4 tile. Loads two A values (xmm0),
+// duplicates each of four B values across both lanes (xmm1, xmm2, xmm3,
+// xmm8) and accumulates one column per register into xmm4..xmm7.
+// Advances BO by four elements and AO by two.
+.macro KERNEL2x4_SUB
+	vmovddup	-12 * SIZE(BO), %xmm1
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vmovddup	-11 * SIZE(BO), %xmm2
+	vfmadd231pd  	%xmm0 ,%xmm1  , %xmm4
+	vmovddup	-10 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm5
+	vmovddup	 -9 * SIZE(BO), %xmm8
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm6
+	addq		$ 4*SIZE, BO
+	vfmadd231pd  	%xmm0 ,%xmm8  , %xmm7
+	addq		$ 2*SIZE, AO
+
+.endm
+
+
+// SAVE2x4: scale xmm4..xmm7 by ALPHA and store two rows to each of four
+// consecutive columns of C (CO1, CO1+LDC, CO1+2*LDC, CO1+3*LDC). Unless
+// TRMMKERNEL is defined the existing C values are added first.
+// Clobbers xmm0 and rax; advances CO1 by two elements.
+.macro SAVE2x4
+
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+	vmulpd	%xmm0 , %xmm5 , %xmm5
+	vmulpd	%xmm0 , %xmm6 , %xmm6
+	vmulpd	%xmm0 , %xmm7 , %xmm7
+
+        leaq    (CO1, LDC, 2), %rax     
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1), %xmm4, %xmm4
+	vaddpd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddpd 	               (%rax), %xmm6, %xmm6
+	vaddpd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm5 ,  	(CO1, LDC)
+	vmovups	%xmm6 ,  	(%rax)
+	vmovups	%xmm7 ,  	(%rax, LDC)
+
+	addq	$ 2*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT1x4: clear the four scalar accumulators (xmm4..xmm7) used by
+// KERNEL1x4_SUB and SAVE1x4.
+.macro INIT1x4
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+// KERNEL1x4_SUB: single-k step of the 1x4 tile using scalar FMAs. One A
+// value (xmm0) is multiplied by four B values (xmm1, xmm2, xmm3, xmm8) and
+// accumulated into xmm4..xmm7. Advances BO by four elements and AO by one.
+.macro KERNEL1x4_SUB
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm4
+	vmovsd	-10 * SIZE(BO), %xmm3
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm5
+	vmovsd	 -9 * SIZE(BO), %xmm8
+	vfmadd231sd  	%xmm0 ,%xmm3  , %xmm6
+	addq		$ 4*SIZE, BO
+	vfmadd231sd  	%xmm0 ,%xmm8  , %xmm7
+	addq		$ 1*SIZE, AO
+
+.endm
+
+
+// SAVE1x4: scale the scalar accumulators xmm4..xmm7 by ALPHA and store one
+// element to each of four consecutive columns of C. Unless TRMMKERNEL is
+// defined the existing C values are added first.
+// Clobbers xmm0 and rax; advances CO1 by one element.
+.macro SAVE1x4
+
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+	vmulsd	%xmm0 , %xmm6 , %xmm6
+	vmulsd	%xmm0 , %xmm7 , %xmm7
+
+        leaq    (CO1, LDC, 2), %rax     
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (CO1), %xmm4, %xmm4
+	vaddsd 	           (CO1, LDC), %xmm5, %xmm5
+	vaddsd 	               (%rax), %xmm6, %xmm6
+	vaddsd 	          (%rax, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+	vmovsd	%xmm6 ,  	(%rax)
+	vmovsd	%xmm7 ,  	(%rax, LDC)
+
+	addq	$ 1*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT4x2: clear the four accumulators (xmm4..xmm7) used by KERNEL4x2_SUB
+// and SAVE4x2 (two registers per column of the 4x2 tile).
+.macro INIT4x2
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+	vxorpd		%xmm7 , %xmm7 , %xmm7
+
+.endm
+
+
+// KERNEL4x2_SUB: single-k step of the 4x2 tile. Four A values are held as
+// two 128-bit halves (xmm0 = rows 0-1, xmm1 = rows 2-3); each of two B
+// values is duplicated across both lanes (xmm2, xmm3).
+// xmm4/xmm5 accumulate the first column, xmm6/xmm7 the second.
+// Advances BO by two elements and AO by four.
+.macro KERNEL4x2_SUB
+	vmovddup	-12 * SIZE(BO), %xmm2
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vmovups 	-14 * SIZE(AO), %xmm1
+	vmovddup	-11 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm4
+	vfmadd231pd  	%xmm1 ,%xmm2  , %xmm5
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm6
+	vfmadd231pd  	%xmm1 ,%xmm3  , %xmm7
+	addq		$ 2*SIZE, BO
+	addq		$ 4*SIZE, AO
+
+.endm
+
+
+// SAVE4x2: scale xmm4..xmm7 by ALPHA and store four rows to each of two
+// columns of C: xmm4/xmm5 go to CO1 and CO1 + 2 elements, xmm6/xmm7 to the
+// same offsets in the next column (CO1 + LDC). Unless TRMMKERNEL is defined
+// the existing C values are added first.
+// Clobbers xmm0; advances CO1 by four elements.
+.macro SAVE4x2
+
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+	vmulpd	%xmm0 , %xmm5 , %xmm5
+	vmulpd	%xmm0 , %xmm6 , %xmm6
+	vmulpd	%xmm0 , %xmm7 , %xmm7
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1)     , %xmm4, %xmm4
+	vaddpd 	        2 * SIZE(CO1)     , %xmm5, %xmm5
+	vaddpd 	                (CO1, LDC), %xmm6, %xmm6
+	vaddpd 	        2 * SIZE(CO1, LDC), %xmm7, %xmm7
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm5 , 2 * SIZE(CO1)
+	vmovups	%xmm6 ,  	(CO1, LDC)
+	vmovups	%xmm7 , 2 * SIZE(CO1, LDC)
+
+	addq	$ 4*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT2x2: clear the two accumulators of the 2x2 tile. Only xmm4 and xmm6
+// are used by KERNEL2x2_SUB and SAVE2x2.
+.macro INIT2x2
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm6 , %xmm6 , %xmm6
+
+.endm
+
+
+// KERNEL2x2_SUB: single-k step of the 2x2 tile. Two A values (xmm0) times
+// two lane-duplicated B values (xmm2, xmm3), accumulated into xmm4 (first
+// column) and xmm6 (second column). Advances AO and BO by two elements.
+.macro KERNEL2x2_SUB
+	vmovddup	-12 * SIZE(BO), %xmm2
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vmovddup	-11 * SIZE(BO), %xmm3
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm4
+	vfmadd231pd  	%xmm0 ,%xmm3  , %xmm6
+	addq		$ 2*SIZE, BO
+	addq		$ 2*SIZE, AO
+
+.endm
+
+
+// SAVE2x2: scale xmm4 and xmm6 by ALPHA and store two rows to each of two
+// columns of C (CO1 and CO1 + LDC). Unless TRMMKERNEL is defined the
+// existing C values are added first.
+// Clobbers xmm0; advances CO1 by two elements.
+.macro SAVE2x2
+
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+	vmulpd	%xmm0 , %xmm6 , %xmm6
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1)     , %xmm4, %xmm4
+	vaddpd 	                (CO1, LDC), %xmm6, %xmm6
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+	vmovups	%xmm6 ,  	(CO1, LDC)
+
+	addq	$ 2*SIZE, CO1
+.endm
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT1x2: clear the two scalar accumulators (xmm4, xmm5) used by
+// KERNEL1x2_SUB and SAVE1x2.
+.macro INIT1x2
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+	vxorpd		%xmm5 , %xmm5 , %xmm5
+
+.endm
+
+
+// KERNEL1x2_SUB: single-k step of the 1x2 tile using scalar FMAs. One A
+// value (xmm0) times two B values (xmm1, xmm2), accumulated into xmm4 and
+// xmm5. Advances BO by two elements and AO by one.
+.macro KERNEL1x2_SUB
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	vmovsd	-11 * SIZE(BO), %xmm2
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm4
+	vfmadd231sd  	%xmm0 ,%xmm2  , %xmm5
+	addq		$ 2*SIZE, BO
+	addq		$ 1*SIZE, AO
+
+.endm
+
+
+// SAVE1x2: scale xmm4 and xmm5 by ALPHA and store one element to each of
+// two columns of C (CO1 and CO1 + LDC). Unless TRMMKERNEL is defined the
+// existing C values are added first.
+// Clobbers xmm0; advances CO1 by one element.
+.macro SAVE1x2
+
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+	vmulsd	%xmm0 , %xmm5 , %xmm5
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (CO1), %xmm4, %xmm4
+	vaddsd 	           (CO1, LDC), %xmm5, %xmm5
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+	vmovsd	%xmm5 ,  	(CO1, LDC)
+
+	addq	$ 1*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT4x1: clear ymm4..ymm7. KERNEL4x1 spreads its partial sums over all
+// four registers and SAVE4x1 adds them together, so all four must start at
+// zero even though KERNEL4x1_SUB only touches ymm4.
+.macro INIT4x1
+
+	vxorpd		%ymm4 , %ymm4 , %ymm4
+	vxorpd		%ymm5 , %ymm5 , %ymm5
+	vxorpd		%ymm6 , %ymm6 , %ymm6
+	vxorpd		%ymm7 , %ymm7 , %ymm7
+
+.endm
+
+
+// KERNEL4x1: eight unrolled k steps of the 4x1 tile. Each step broadcasts
+// one B value and multiplies it by four A values read directly from memory.
+// The eight products are distributed round-robin over ymm4..ymm7 (two steps
+// per register); SAVE4x1 sums the four registers.
+// Advances BO by 8 elements and AO by 32.
+.macro KERNEL4x1
+
+	vbroadcastsd	-12 * SIZE(BO), %ymm0
+	vbroadcastsd	-11 * SIZE(BO), %ymm1
+	vbroadcastsd	-10 * SIZE(BO), %ymm2
+	vbroadcastsd	-9  * SIZE(BO), %ymm3
+
+	vfmadd231pd  	-16 * SIZE(AO) ,%ymm0  , %ymm4
+	vfmadd231pd  	-12 * SIZE(AO) ,%ymm1  , %ymm5
+
+	vbroadcastsd	-8  * SIZE(BO), %ymm0
+	vbroadcastsd	-7  * SIZE(BO), %ymm1
+
+	vfmadd231pd  	-8  * SIZE(AO) ,%ymm2  , %ymm6
+	vfmadd231pd  	-4  * SIZE(AO) ,%ymm3  , %ymm7
+
+	vbroadcastsd	-6  * SIZE(BO), %ymm2
+	vbroadcastsd	-5  * SIZE(BO), %ymm3
+
+	vfmadd231pd  	 0  * SIZE(AO) ,%ymm0  , %ymm4
+	vfmadd231pd  	 4  * SIZE(AO) ,%ymm1  , %ymm5
+	vfmadd231pd  	 8  * SIZE(AO) ,%ymm2  , %ymm6
+	vfmadd231pd  	 12 * SIZE(AO) ,%ymm3  , %ymm7
+
+	addq		$ 8 *SIZE, BO
+	addq		$ 32*SIZE, AO
+
+.endm
+
+
+// KERNEL4x1_SUB: single-k step of the 4x1 tile. Broadcasts one B value
+// (ymm2), multiplies by four A values (ymm0) and accumulates into ymm4 only.
+// Advances BO by one element and AO by four.
+.macro KERNEL4x1_SUB
+	vbroadcastsd	-12 * SIZE(BO), %ymm2
+	vmovups 	-16 * SIZE(AO), %ymm0
+	vfmadd231pd  	%ymm0 ,%ymm2  , %ymm4
+	addq		$ 1*SIZE, BO
+	addq		$ 4*SIZE, AO
+
+.endm
+
+
+// SAVE4x1: reduce the four partial accumulators ymm4..ymm7 into ymm4, scale
+// by ALPHA and store four rows to the single column at CO1. Unless
+// TRMMKERNEL is defined the existing C values are added first.
+// Clobbers ymm0 and ymm6; advances CO1 by four elements.
+.macro SAVE4x1
+
+	vbroadcastsd	ALPHA, %ymm0
+
+	vaddpd	%ymm4,%ymm5, %ymm4 
+	vaddpd	%ymm6,%ymm7, %ymm6 
+	vaddpd	%ymm4,%ymm6, %ymm4 
+
+	vmulpd	%ymm0 , %ymm4 , %ymm4
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1)     , %ymm4, %ymm4
+
+#endif
+
+	vmovups	%ymm4 ,  	(CO1)
+
+	addq	$ 4*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT2x1: clear xmm4, the only accumulator of the 2x1 tile.
+.macro INIT2x1
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+
+.endm
+
+
+// KERNEL2x1_SUB: single-k step of the 2x1 tile. One lane-duplicated B value
+// (xmm2) times two A values (xmm0), accumulated into xmm4.
+// Advances BO by one element and AO by two.
+.macro KERNEL2x1_SUB
+	vmovddup	-12 * SIZE(BO), %xmm2
+	vmovups 	-16 * SIZE(AO), %xmm0
+	vfmadd231pd  	%xmm0 ,%xmm2  , %xmm4
+	addq		$ 1*SIZE, BO
+	addq		$ 2*SIZE, AO
+
+.endm
+
+
+// SAVE2x1: scale xmm4 by ALPHA and store two rows to the column at CO1.
+// Unless TRMMKERNEL is defined the existing C values are added first.
+// Clobbers xmm0; advances CO1 by two elements.
+.macro SAVE2x1
+
+	vmovddup	ALPHA, %xmm0
+
+	vmulpd	%xmm0 , %xmm4 , %xmm4
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddpd 	                (CO1)     , %xmm4, %xmm4
+
+#endif
+
+	vmovups	%xmm4 ,  	(CO1)
+
+	addq	$ 2*SIZE, CO1
+.endm
+
+
+/******************************************************************************************/
+/******************************************************************************************/
+
+// INIT1x1: clear xmm4, the only accumulator of the 1x1 tile.
+.macro INIT1x1
+
+	vxorpd		%xmm4 , %xmm4 , %xmm4
+
+.endm
+
+
+// KERNEL1x1_SUB: single-k scalar step: xmm4 += A value * B value.
+// Advances AO and BO by one element each.
+.macro KERNEL1x1_SUB
+	vmovsd	-12 * SIZE(BO), %xmm1
+	vmovsd 	-16 * SIZE(AO), %xmm0
+	vfmadd231sd  	%xmm0 ,%xmm1  , %xmm4
+	addq		$ 1*SIZE, BO
+	addq		$ 1*SIZE, AO
+
+.endm
+
+
+// SAVE1x1: scale the scalar accumulator xmm4 by ALPHA and store it at CO1.
+// Unless TRMMKERNEL is defined the existing C value is added first.
+// Clobbers xmm0; advances CO1 by one element.
+.macro SAVE1x1
+
+	vmovsd	ALPHA, %xmm0
+
+	vmulsd	%xmm0 , %xmm4 , %xmm4
+
+
+#if !defined(TRMMKERNEL)
+
+	vaddsd 	                (CO1), %xmm4, %xmm4
+
+#endif
+
+	vmovsd	%xmm4 ,  	(CO1)
+
+	addq	$ 1*SIZE, CO1
+.endm
+
+
+/*******************************************************************************************/
+
+#if !defined(TRMMKERNEL)
+
+
+	// GEMM entry point (built when TRMMKERNEL is not defined).
+	// Saves callee-saved registers, sets up an aligned scratch area, returns
+	// early when any of M, N, K is zero, and splits N into 24-column passes.
+	PROLOGUE
+	PROFCODE
+	
+	// save callee-saved general purpose registers in the STACKSIZE frame
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	// Windows build: also preserve rdi, rsi and xmm6-xmm15, then move the
+	// incoming arguments into the registers the kernel body expects
+	movq	%rdi,    48(%rsp)
+	movq	%rsi,    56(%rsp)
+	vmovups	%xmm6,   64(%rsp)
+	vmovups	%xmm7,   80(%rsp)
+	vmovups	%xmm8,   96(%rsp)
+	vmovups	%xmm9,  112(%rsp)
+	vmovups	%xmm10, 128(%rsp)
+	vmovups	%xmm11, 144(%rsp)
+	vmovups	%xmm12, 160(%rsp)
+	vmovups	%xmm13, 176(%rsp)
+	vmovups	%xmm14, 192(%rsp)
+	vmovups	%xmm15, 208(%rsp)
+
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+
+	vmovups	%xmm3, %xmm0		// xmm0 is stored to ALPHA below
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC
+
+#endif
+
+	movq    %rsp, SP      # save old stack
+        subq    $128 + L_BUFFER_SIZE, %rsp
+        andq    $-4096, %rsp    # align stack
+
+        STACK_TOUCH
+
+	// nothing to compute when any dimension is zero
+	cmpq	$ 0, OLD_M
+	je	.L999
+
+	cmpq	$ 0, OLD_N
+	je	.L999
+
+	cmpq	$ 0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovsd	 %xmm0, ALPHA
+
+	salq	$BASE_SHIFT, LDC	// LDC is a byte stride from here on
+
+	// Despite their names, Ndiv12 / Nmod12 hold N / 24 and N % 24: each J
+	// iteration below handles two 12-column passes (.L12_* then .L13_*).
+	movq    N, %rax
+        xorq    %rdx, %rdx
+        movq    $24,  %rdi
+        divq    %rdi                     //    N / 24
+        movq    %rax, Ndiv12             //    N / 24
+        movq    %rdx, Nmod12             //    N % 24
+
+
+	movq	Ndiv12,  J
+	cmpq	$ 0, J
+	je	.L8_0			// fewer than 24 columns: go to the 8-column panels
+	ALIGN_4
+
+// First 12-column pass of a 24-column group: build a 12-wide copy of B in
+// BUFFER1 from the current 8-column panel (BO1) plus the first four columns
+// of the following panel (BO2). B is advanced to the second panel.
+.L12_01:
+        // copy to sub buffer
+        movq    K, %rax
+        salq    $3,%rax                 // K * 8 elements = one 8-column panel
+        movq    B, BO1
+        leaq    (B,%rax, SIZE), BO2     // BO2 = start of the next panel
+	movq	BO2 , B			
+
+        leaq    BUFFER1, BO             // first buffer to BO
+        movq    K, %rax
+
+        ALIGN_4
+
+.L12_02b:
+
+	// one k step: 8 values from BO1 + 4 values from BO2 -> 12 values in BO
+	vmovups	0 * SIZE(BO1), %ymm1
+	vmovups	4 * SIZE(BO1), %ymm2
+	vmovups	0 * SIZE(BO2), %ymm3
+	vmovups	%ymm1, 0 * SIZE(BO)
+	vmovups	%ymm2, 4 * SIZE(BO)
+	vmovups	%ymm3, 8 * SIZE(BO)
+	addq	$ 8*SIZE,BO1
+	addq	$ 8*SIZE,BO2
+	addq	$ 12*SIZE,BO
+	decq	%rax
+	jnz	.L12_02b
+
+.L12_03c:
+
+
+// 4-row tiles of the first 12-column pass. AO and BO are biased by +16 and
+// +12 elements; the kernel macros visible in this file address A and B with
+// matching negative displacements (-16 * SIZE(AO), -12 * SIZE(BO)).
+.L12_10:
+	movq	C, CO1
+	leaq	(C, LDC, 8), C		 
+	leaq	(C, LDC, 4), C		// c += 12 * ldc
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L12_20
+
+	ALIGN_4
+
+.L12_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+        movq    K, %rax
+
+	sarq $3, %rax			//  K / 8
+	cmpq $2, %rax
+
+	jl	.L12_13			// fewer than two blocks of 8: short path
+
+
+	// first block of 8 k steps (_I opens the sequence)
+	KERNEL4x12_I
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	subq $2, %rax			// first and last block are outside the loop
+	je	.L12_12a
+
+	ALIGN_5
+.L12_12:
+
+	// middle blocks: 8 k steps per iteration
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	dec	%rax
+	jne	.L12_12
+
+.L12_12a:
+
+	// last block of 8 k steps (_E closes the sequence)
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_E
+
+	jmp .L12_16
+
+
+.L12_13:
+
+	// K / 8 is below 2 here: run exactly one block of 8 if it is odd
+	test $1, %rax
+	jz .L12_14
+
+	KERNEL4x12_I
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_E
+
+	jmp .L12_16
+
+
+.L12_14:
+
+	// no full block of 8 was run
+	INIT4x12
+
+
+.L12_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L12_19
+
+	ALIGN_4
+
+.L12_17:
+
+	KERNEL4x12_SUB
+
+	dec	%rax
+	jne	.L12_17
+	ALIGN_4
+
+
+.L12_19:
+
+	SAVE4x12
+
+	decq	I			# i --
+	jne	.L12_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// First 12-column pass: leftover rows (M % 4), starting with a 2-row tile.
+.L12_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L12_100			// M % 4 == 0: this pass is complete
+
+
+.L12_30:
+	testq	$2, M		
+	jz	.L12_40
+
+	ALIGN_4
+
+.L12_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	INIT2x12
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L12_36
+	ALIGN_4
+
+.L12_32:
+
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+
+	dec %rax
+	jne	.L12_32
+	ALIGN_4
+
+.L12_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L12_39
+
+	ALIGN_4
+
+.L12_37:
+
+	KERNEL2x12_SUB
+
+	dec %rax
+	jne	.L12_37
+	ALIGN_4
+
+
+.L12_39:
+
+	SAVE2x12
+
+	ALIGN_4
+
+// First 12-column pass: final single row when M is odd.
+.L12_40:
+	testq	$1, M		
+	jz	.L12_100		// M is even: this pass is complete
+
+	ALIGN_4
+
+.L12_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	INIT1x12
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L12_46
+
+	ALIGN_4
+
+.L12_42:
+
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+
+
+	dec %rax
+	jne	.L12_42
+	ALIGN_4
+
+.L12_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L12_49
+
+	ALIGN_4
+
+.L12_47:
+
+	KERNEL1x12_SUB
+
+	dec	%rax
+	jne	.L12_47
+	ALIGN_4
+
+
+.L12_49:
+
+	SAVE1x12
+
+	ALIGN_4
+	
+.L12_100:
+
+
+
+/**************************************************************************************************/
+
+// Second 12-column pass of a 24-column group: build a 12-wide copy of B in
+// BUFFER1 from the last four columns of the panel at B (BO2, offset 4) plus
+// the whole following 8-column panel (BO3). B is advanced past both panels.
+.L13_01:
+        // copy to sub buffer
+        movq    K, %rax
+        salq    $3,%rax                 // K * 8 elements = one 8-column panel
+        movq    B, BO2
+        leaq    (B,%rax, SIZE), BO3     // BO3 = start of the next panel
+        leaq    (BO3,%rax, SIZE), B     // B   = panel after BO3
+
+
+        leaq    BUFFER1, BO             // first buffer to BO
+        movq    K, %rax
+
+        ALIGN_4
+
+
+.L13_02b:
+
+	// one k step: 4 values from BO2 (offset 4) + 8 values from BO3
+	vmovups	4 * SIZE(BO2), %ymm1
+	vmovups	0 * SIZE(BO3), %ymm2
+	vmovups	4 * SIZE(BO3), %ymm3
+	vmovups	%ymm1, 0 * SIZE(BO)
+	vmovups	%ymm2, 4 * SIZE(BO)
+	vmovups	%ymm3, 8 * SIZE(BO)
+	addq	$ 8*SIZE,BO2
+	addq	$ 8*SIZE,BO3
+	addq	$ 12*SIZE,BO
+	decq	%rax
+	jnz	.L13_02b
+
+
+
+// 4-row tiles of the second 12-column pass; same structure as .L12_10.
+.L13_10:
+	movq	C, CO1
+	leaq	(C, LDC, 8), C		 
+	leaq	(C, LDC, 4), C		// c += 12 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L13_20
+
+	ALIGN_4
+
+.L13_11:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+        movq    K, %rax
+
+	sarq $3, %rax			//  K / 8
+	cmpq $2, %rax
+
+	jl	.L13_13			// fewer than two blocks of 8: short path
+
+
+	// first block of 8 k steps (_I opens the sequence)
+	KERNEL4x12_I
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	subq $2, %rax			// first and last block are outside the loop
+	je	.L13_12a
+
+	ALIGN_5
+.L13_12:
+
+	// middle blocks: 8 k steps per iteration
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	dec	%rax
+	jne	.L13_12
+
+.L13_12a:
+
+	// last block of 8 k steps (_E closes the sequence)
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_E
+
+	jmp .L13_16
+
+
+.L13_13:
+
+	// K / 8 is below 2 here: run exactly one block of 8 if it is odd
+	test $1, %rax
+	jz .L13_14
+
+	KERNEL4x12_I
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+
+	KERNEL4x12_M1
+	KERNEL4x12_M2
+	KERNEL4x12_M1
+	KERNEL4x12_E
+
+	jmp .L13_16
+
+
+.L13_14:
+
+	// no full block of 8 was run
+	INIT4x12
+
+
+.L13_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L13_19
+
+	ALIGN_4
+
+.L13_17:
+
+	KERNEL4x12_SUB
+
+	dec	%rax
+	jne	.L13_17
+	ALIGN_4
+
+
+.L13_19:
+
+	SAVE4x12
+
+	decq	I			# i --
+	jne	.L13_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// Second 12-column pass: leftover rows (M % 4), starting with a 2-row tile.
+.L13_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L13_100			// M % 4 == 0: this pass is complete
+
+
+.L13_30:
+	testq	$2, M		
+	jz	.L13_40
+
+	ALIGN_4
+
+.L13_31:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	INIT2x12
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L13_36
+	ALIGN_4
+
+.L13_32:
+
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+	KERNEL2x12_SUB
+
+	dec %rax
+	jne	.L13_32
+	ALIGN_4
+
+.L13_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L13_39
+
+	ALIGN_4
+
+.L13_37:
+
+	KERNEL2x12_SUB
+
+	dec %rax
+	jne	.L13_37
+	ALIGN_4
+
+
+.L13_39:
+
+	SAVE2x12
+
+	ALIGN_4
+
+// Second 12-column pass: final single row when M is odd, then the bottom of
+// the J loop over 24-column groups.
+.L13_40:
+	testq	$1, M		
+	jz	.L13_100		// M is even: this pass is complete
+
+	ALIGN_4
+
+.L13_41:
+        leaq    BUFFER1, BO             // first buffer to BO
+        addq    $12 * SIZE, BO
+
+	INIT1x12
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L13_46
+
+	ALIGN_4
+
+.L13_42:
+
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+	KERNEL1x12_SUB
+
+
+	dec %rax
+	jne	.L13_42
+	ALIGN_4
+
+.L13_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L13_49
+
+	ALIGN_4
+
+.L13_47:
+
+	KERNEL1x12_SUB
+
+	dec	%rax
+	jne	.L13_47
+	ALIGN_4
+
+
+.L13_49:
+
+	SAVE1x12
+
+	ALIGN_4
+	
+.L13_100:
+
+	decq	J			// j -- (one 24-column group done)
+	jg	.L12_01
+
+
+
+
+/**************************************************************************************************/
+
+// Columns left over after the 24-column groups (Nmod12 holds N % 24):
+// process them as 8-column panels read directly from B, 4 rows at a time.
+.L8_0:
+
+	cmpq	$ 0, Nmod12		// N % 24 == 0: nothing left
+	je	.L999
+
+	movq	Nmod12, J		
+	sarq	$3, J			// j = (N % 24) / 8
+	je	.L4_0
+
+.L8_10:
+	movq	C, CO1
+	leaq	(C, LDC, 8), C		// c += 8 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L8_20
+
+	ALIGN_4
+
+.L8_11:
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+
+        movq    K, %rax
+
+	sarq	$3, %rax			//  K / 8
+	cmpq    $2, %rax
+	jl	.L8_13			// fewer than two blocks of 8: short path
+
+
+	// first block of 8 k steps (_I opens the sequence)
+	KERNEL4x8_I
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	subq $2, %rax			// first and last block are outside the loop
+	je	.L8_12a
+
+	ALIGN_5
+
+.L8_12:
+
+	// middle blocks: 8 k steps per iteration
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	dec	%rax
+	jne	.L8_12
+
+.L8_12a:
+
+	// last block of 8 k steps (_E closes the sequence)
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_E
+
+	jmp .L8_16
+
+
+.L8_13:
+
+	// K / 8 is below 2 here: run exactly one block of 8 if it is odd
+	test $1, %rax
+	jz .L8_14
+
+	KERNEL4x8_I
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_E
+
+	jmp .L8_16
+
+
+.L8_14:
+
+	// no full block of 8 was run
+	INIT4x8
+
+
+.L8_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L8_19
+
+	ALIGN_4
+
+.L8_17:
+
+	KERNEL4x8_SUB
+
+	dec	%rax
+	jne	.L8_17
+	ALIGN_4
+
+
+.L8_19:
+
+	SAVE4x8
+
+	decq	I			# i --
+	jg	.L8_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// 8-column panel: leftover rows (M % 4), starting with a 2-row tile.
+.L8_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L8_100			// M % 4 == 0: this panel is complete
+
+
+.L8_30:
+	testq	$2, M		
+	jz	.L8_40
+
+	ALIGN_4
+
+.L8_31:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT2x8
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L8_36
+	ALIGN_4
+
+.L8_32:
+
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+
+	dec %rax
+	jne	.L8_32
+	ALIGN_4
+
+.L8_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L8_39
+
+	ALIGN_4
+
+.L8_37:
+
+	KERNEL2x8_SUB
+
+	dec %rax
+	jne	.L8_37
+
+
+.L8_39:
+
+	SAVE2x8
+
+// 8-column panel: final single row when M is odd, then advance B by one
+// panel (K * 8 elements) and loop over the remaining 8-column panels.
+.L8_40:
+	testq	$1, M		
+	jz	.L8_100		// M is even: this panel is complete
+
+	ALIGN_4
+
+.L8_41:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT1x8
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L8_46
+
+	ALIGN_4
+
+.L8_42:
+
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+
+	dec %rax
+	jne	.L8_42
+	ALIGN_4
+
+.L8_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L8_49
+
+	ALIGN_4
+
+.L8_47:
+
+	KERNEL1x8_SUB
+
+	dec	%rax
+	jne	.L8_47
+	ALIGN_4
+
+
+.L8_49:
+
+	SAVE1x8
+
+	ALIGN_4
+	
+.L8_100:
+
+	movq	K, %rax
+	salq	$3, %rax		// * 8
+	leaq	(B , %rax, SIZE), B	// b += 8 * k elements
+	decq	J			// j --
+	jg	.L8_10
+
+
+
+/**************************************************************************************************/
+
+// Optional 4-column panel (bit 2 of N % 24), 4 rows at a time.
+.L4_0:
+
+	cmpq	$ 0, Nmod12		// N % 24 == 0 (already checked at .L8_0)
+	je	.L999
+
+	movq	Nmod12, J		
+	testq   $4, J			// is a 4-column panel left?
+	je	.L2_0
+
+.L4_10:
+	movq	C, CO1
+	leaq	(C, LDC, 4), C		// c += 4 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L4_20
+
+	ALIGN_4
+
+.L4_11:
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+
+        movq    K, %rax
+
+	sarq	$3, %rax			//  K / 8
+	cmpq    $2, %rax
+	jl	.L4_13			// fewer than two blocks of 8: short path
+
+
+	// first block of 8 k steps (_I opens the sequence)
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subq $2, %rax			// first and last block are outside the loop
+	je	.L4_12a
+
+	ALIGN_5
+
+.L4_12:
+
+	// middle blocks: 8 k steps per iteration
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	dec	%rax
+	jne	.L4_12
+
+.L4_12a:
+
+	// last block of 8 k steps (_E closes the sequence)
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	jmp .L4_16
+
+
+.L4_13:
+
+	// K / 8 is below 2 here: run exactly one block of 8 if it is odd
+	test $1, %rax
+	jz .L4_14
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	jmp .L4_16
+
+
+.L4_14:
+
+	// no full block of 8 was run
+	INIT4x4
+
+
+.L4_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L4_19
+
+	ALIGN_4
+
+.L4_17:
+
+	KERNEL4x4_SUB
+
+	dec	%rax
+	jne	.L4_17
+	ALIGN_4
+
+
+.L4_19:
+
+	SAVE4x4
+
+	decq	I			# i --
+	jg	.L4_11
+
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// 4-column panel: leftover rows (M % 4), starting with a 2-row tile.
+.L4_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L4_100			// M % 4 == 0: this panel is complete
+
+
+.L4_30:
+	testq	$2, M		
+	jz	.L4_40
+
+	ALIGN_4
+
+.L4_31:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT2x4
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L4_36
+	ALIGN_4
+
+.L4_32:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	dec %rax
+	jne	.L4_32
+	ALIGN_4
+
+.L4_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L4_39
+
+	ALIGN_4
+
+.L4_37:
+
+	KERNEL2x4_SUB
+
+	dec %rax
+	jne	.L4_37
+
+
+.L4_39:
+
+	SAVE2x4
+
+// 4-column panel: final single row when M is odd, then advance B past the
+// panel (K * 4 elements).
+.L4_40:
+	testq	$1, M		
+	jz	.L4_100		// M is even: this panel is complete
+
+	ALIGN_4
+
+.L4_41:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT1x4
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L4_46
+
+	ALIGN_4
+
+.L4_42:
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	dec %rax
+	jne	.L4_42
+	ALIGN_4
+
+.L4_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L4_49
+
+	ALIGN_4
+
+.L4_47:
+
+	KERNEL1x4_SUB
+
+	dec	%rax
+	jne	.L4_47
+	ALIGN_4
+
+
+.L4_49:
+
+	SAVE1x4
+
+	ALIGN_4
+	
+.L4_100:
+
+	movq	K, %rax
+	salq	$2, %rax		// * 4
+	leaq	(B , %rax, SIZE), B	// b += 4 * k elements
+
+
+
+
+/***************************************************************************************************************/
+
+// Optional 2-column panel (bit 1 of N % 24), 4 rows at a time.
+.L2_0:
+
+	movq	Nmod12, J		
+	testq	$2, J			// is a 2-column panel left?
+	je	.L1_0
+
+.L2_10:
+	movq	C, CO1
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L2_20
+
+	ALIGN_4
+
+.L2_11:
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+
+	INIT4x2
+
+        movq    K, %rax
+	sarq $3, %rax			//  K / 8
+
+	je	.L2_16
+
+	ALIGN_5
+
+.L2_12:
+
+	// 8 k steps per iteration
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	dec	%rax
+	jne	.L2_12
+
+
+.L2_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L2_19
+
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL4x2_SUB
+
+	dec	%rax
+	jne	.L2_17
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE4x2
+
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// 2-column panel: leftover rows (M % 4), starting with a 2-row tile.
+.L2_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L2_100			// M % 4 == 0: this panel is complete
+
+
+.L2_30:
+	testq	$2, M		
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT2x2
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L2_36
+	ALIGN_4
+
+.L2_32:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	dec %rax
+	jne	.L2_32
+
+.L2_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L2_39
+
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	dec %rax
+	jne	.L2_37
+
+
+.L2_39:
+
+	SAVE2x2
+
+// 2-column panel: final single row when M is odd, then advance B past the
+// panel (K * 2 elements).
+.L2_40:
+	testq	$1, M		
+	jz	.L2_100		// M is even: this panel is complete
+
+.L2_41:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT1x2
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L2_46
+
+	ALIGN_4
+
+.L2_42:
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	dec %rax
+	jne	.L2_42
+
+.L2_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L2_49
+
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	dec	%rax
+	jne	.L2_47
+
+.L2_49:
+
+	SAVE1x2
+
+.L2_100:
+
+	movq	K, %rax
+	salq	$1, %rax		// * 2
+	leaq	(B , %rax, SIZE), B	// b += 2 * k elements
+
+/***************************************************************************************************************/
+
+// Optional final single column (bit 0 of N % 24), 4 rows at a time.
+.L1_0:
+
+	movq	Nmod12, J		
+	testq	$1, J			// is a single column left?
+	je	.L999
+
+.L1_10:
+	movq	C, CO1
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L1_20
+
+	ALIGN_4
+
+.L1_11:
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+
+	INIT4x1
+
+        movq    K, %rax
+
+	sarq	$3, %rax			//  K / 8
+	je	.L1_16
+
+	ALIGN_5
+
+.L1_12:
+
+	KERNEL4x1			// one invocation covers 8 k steps
+
+	dec	%rax
+	jne	.L1_12
+
+
+.L1_16:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L1_19
+
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL4x1_SUB
+
+	dec	%rax
+	jne	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE4x1
+
+	decq	I			# i --
+	jg	.L1_11
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+// Single column: leftover rows (M % 4), starting with a 2-row tile.
+.L1_20:
+	// Test rest of M
+
+	testq	$3, M
+	jz	.L1_100	
+
+
+.L1_30:
+	testq	$2, M		
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT2x1
+
+        movq    K, %rax
+
+	sarq	$3, %rax		// K / 8 unrolled iterations
+	je	.L1_36
+	ALIGN_4
+
+.L1_32:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+
+	dec %rax
+	jne	.L1_32
+
+.L1_36:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L1_39
+
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	dec %rax
+	jne	.L1_37
+
+.L1_39:
+
+	SAVE2x1
+
+// Single column: final 1x1 element when M is odd.
+.L1_40:
+	testq	$1, M		
+	jz	.L1_100		// M is even: nothing left to compute
+
+
+.L1_41:
+        movq    B, BO             // B is read in place (no sub buffer)
+        addq    $12 * SIZE, BO
+
+	INIT1x1
+
+        movq    K, %rax
+
+	sarq	$3,%rax			// K / 8 unrolled iterations
+	je	.L1_46
+
+	ALIGN_4
+
+.L1_42:
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	dec %rax
+	jne	.L1_42
+
+.L1_46:
+        movq    K, %rax
+
+	andq	$7, %rax		# remaining k steps: K % 8
+	je .L1_49
+
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	dec	%rax
+	jne	.L1_47
+
+
+.L1_49:
+
+	SAVE1x1
+
+.L1_100:
+
+
+
+
+// Common exit: restore the stack pointer saved in SP by the entry code,
+// reload the callee-saved registers stored there, release the frame, return.
+.L999:
+	vzeroupper
+
+	movq   		SP, %rsp
+	movq	   (%rsp), %rbx
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	// Windows build: also restore rdi, rsi and xmm6-xmm15
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	vmovups	 64(%rsp), %xmm6
+	vmovups	 80(%rsp), %xmm7
+	vmovups	 96(%rsp), %xmm8
+	vmovups	112(%rsp), %xmm9
+	vmovups	128(%rsp), %xmm10
+	vmovups	144(%rsp), %xmm11
+	vmovups	160(%rsp), %xmm12
+	vmovups	176(%rsp), %xmm13
+	vmovups	192(%rsp), %xmm14
+	vmovups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp
+	ret
+
+	EPILOGUE
+
+
+#else
+/*************************************************************************************
+* TRMM Kernel
+*************************************************************************************/
+
+
+	PROLOGUE
+	PROFCODE
+	// Entry: spill callee-saved state, fetch the arguments, then switch to a 4096-byte aligned scratch stack.
+	subq	$STACKSIZE, %rsp
+	movq	%rbx,   (%rsp)
+	movq	%rbp,  8(%rsp)
+	movq	%r12, 16(%rsp)
+	movq	%r13, 24(%rsp)
+	movq	%r14, 32(%rsp)
+	movq	%r15, 40(%rsp)
+
+	vzeroupper
+
+#ifdef WINDOWS_ABI
+	movq	%rdi,    48(%rsp)	// rdi, rsi and xmm6-xmm15 are non-volatile in the Windows x64 ABI
+	movq	%rsi,    56(%rsp)
+	vmovups	%xmm6,   64(%rsp)
+	vmovups	%xmm7,   80(%rsp)
+	vmovups	%xmm8,   96(%rsp)
+	vmovups	%xmm9,  112(%rsp)
+	vmovups	%xmm10, 128(%rsp)
+	vmovups	%xmm11, 144(%rsp)
+	vmovups	%xmm12, 160(%rsp)
+	vmovups	%xmm13, 176(%rsp)
+	vmovups	%xmm14, 192(%rsp)
+	vmovups	%xmm15, 208(%rsp)
+
+	movq	ARG1,      OLD_M
+	movq	ARG2,      OLD_N
+	movq	ARG3,      OLD_K
+	movq	OLD_A,     A
+	movq	OLD_B,     B
+	movq	OLD_C,     C
+	movq	OLD_LDC,   LDC
+#ifdef TRMMKERNEL
+	vmovsd	OLD_OFFSET, %xmm12	// offset argument; written to OFFSET and KK further down
+#endif
+	vmovups	%xmm3, %xmm0		// copy xmm3 into xmm0, which is stored to ALPHA further down
+
+#else
+	movq	STACKSIZE +  8(%rsp), LDC	// ldc is read from the stack above the save area
+#ifdef TRMMKERNEL
+	vmovsd	STACKSIZE + 16(%rsp), %xmm12	// offset is read from the stack above the save area
+#endif
+
+#endif
+
+	movq    %rsp, SP      # keep the pre-alignment stack pointer; .L999 restores it
+        subq    $128 + L_BUFFER_SIZE, %rsp
+        andq    $-4096, %rsp    # round the stack pointer down to a 4096-byte boundary
+
+        STACK_TOUCH
+
+	cmpq	$ 0, OLD_M		// leave immediately when M, N or K is zero
+	je	.L999
+
+	cmpq	$ 0, OLD_N
+	je	.L999
+
+	cmpq	$ 0, OLD_K
+	je	.L999
+
+	movq	OLD_M, M
+	movq	OLD_N, N
+	movq	OLD_K, K
+
+	vmovsd	 %xmm0, ALPHA
+
+	salq	$BASE_SHIFT, LDC	// scale ldc by 2^BASE_SHIFT (presumably elements to bytes - confirm)
+
+	movq    N, %rax
+        xorq    %rdx, %rdx
+        movq    $8,  %rdi
+        divq    %rdi                     // unsigned rdx:rax / 8 -> quotient rax, remainder rdx
+        movq    %rax, Ndiv12             // despite its name, Ndiv12 holds N / 8 here
+        movq    %rdx, Nmod12             // despite its name, Nmod12 holds N % 8 here
+
+#ifdef TRMMKERNEL
+        vmovsd  %xmm12, OFFSET
+        vmovsd  %xmm12, KK
+#ifndef LEFT
+        negq    KK		// right-side variant starts from KK = -offset
+#endif  
+#endif
+
+/*************************************************************************************************/
+.L8_0:					// ---- full panels of 8 columns ----
+	movq	Ndiv12,  J		// J = panel count; Ndiv12 holds N / 8 in this kernel
+	cmpq	$ 0, J
+	je	.L4_0			// no full panel: go to the 4-column remainder
+	ALIGN_4
+
+.L8_10:					// head of the per-panel loop
+	movq	C, CO1			// CO1 = output pointer for this panel
+	leaq	(C, LDC, 8), C		// c += 8 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax		// LEFT: KK restarts from OFFSET for every panel
+        movq    %rax, KK
+#endif
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO		// AO = A + 16 elements (bias presumably expected by the KERNEL macros - confirm)
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L8_20			// fewer than 4 rows: only the remainder tiles run
+
+	ALIGN_4
+
+.L8_11:					// ---- one 4x8 tile per pass ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,8), BO		// skip the first KK k-steps of B (8 values each)
+	leaq	(AO,%rax,4), AO		// skip the first KK k-steps of A (4 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // LEFT: KK + 4 (values per k-step in AO)
+#else
+        addq    $8, %rax        // otherwise: KK + 8 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	sarq	$3, %rax			// rax = k-step count / 8 (groups of 8 steps)
+	cmpq    $2, %rax
+	jl	.L8_13			// fewer than two groups: handled separately
+	// I / M1 / M2 / E look like start, steady-state and end stages of a pipelined loop (macros not shown - confirm).
+
+	KERNEL4x8_I
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	subq $2, %rax			// first and last groups are peeled off the loop
+	je	.L8_12a			// nothing in between: run the last group
+
+	ALIGN_5
+
+.L8_12:					// middle groups, 8 k-steps each
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	dec	%rax
+	jne	.L8_12
+
+.L8_12a:				// last group, closed by KERNEL4x8_E
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_E
+
+	jmp .L8_16
+
+
+.L8_13:					// group count below two
+
+	test $1, %rax			// bit 0 of the group count
+	jz .L8_14			// clear: no full group to run
+
+	KERNEL4x8_I
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+
+	KERNEL4x8_M1
+	KERNEL4x8_M2
+	KERNEL4x8_M1
+	KERNEL4x8_E
+
+	jmp .L8_16
+
+
+.L8_14:
+
+	INIT4x8				// presumably clears the accumulators (macro not shown)
+
+
+.L8_16:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L8_19
+
+	ALIGN_4
+
+.L8_17:
+
+	KERNEL4x8_SUB
+
+	dec	%rax
+	jne	.L8_17
+	ALIGN_4
+
+
+.L8_19:
+
+	SAVE4x8				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 8), BO		// step BO over the unused k-steps (8 values each)
+        leaq    (AO, %rax, 4), AO		// step AO over the unused k-steps (4 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK				// LEFT: KK grows by the 4 rows just processed
+#endif
+
+	decq	I			# i --
+	jg	.L8_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L8_20:
+	// Rows left over after the 4-row tiles (M % 4), still within the 8-column panel
+
+	testq	$3, M
+	jz	.L8_100			// M is a multiple of 4: this panel is finished
+
+
+.L8_30:
+	testq	$2, M		// bit 1 of M set: a 2x8 tile is needed
+	jz	.L8_40
+
+	ALIGN_4
+
+.L8_31:					// ---- 2x8 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,8), BO		// skip the first KK k-steps of B (8 values each)
+	leaq	(AO,%rax,2), AO		// skip the first KK k-steps of A (2 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // LEFT: KK + 2 (values per k-step in AO)
+#else
+        addq    $8, %rax        // otherwise: KK + 8 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT2x8				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3, %rax		// rax = k-step count / 8
+	je	.L8_36
+	ALIGN_4
+
+.L8_32:					// 8 k-steps per pass
+
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+	KERNEL2x8_SUB
+
+	dec %rax
+	jne	.L8_32
+	ALIGN_4
+
+.L8_36:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L8_39
+
+	ALIGN_4
+
+.L8_37:
+
+	KERNEL2x8_SUB
+
+	dec %rax
+	jne	.L8_37
+
+
+.L8_39:
+
+	SAVE2x8				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 8), BO		// step BO over the unused k-steps (8 values each)
+        leaq    (AO, %rax, 2), AO		// step AO over the unused k-steps (2 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK				// LEFT: KK grows by the 2 rows just processed
+#endif
+
+
+.L8_40:
+	testq	$1, M		// bit 0 of M set: a final 1x8 tile is needed
+	jz	.L8_100		// no odd row: this panel is finished
+
+	ALIGN_4
+
+.L8_41:					// ---- 1x8 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,8), BO		// skip the first KK k-steps of B (8 values each)
+	leaq	(AO,%rax,1), AO		// skip the first KK k-steps of A (1 value each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // LEFT: KK + 1 (values per k-step in AO)
+#else
+        addq    $8, %rax        // otherwise: KK + 8 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT1x8				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3,%rax			// rax = k-step count / 8
+	je	.L8_46
+
+	ALIGN_4
+
+.L8_42:					// 8 k-steps per pass
+
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+	KERNEL1x8_SUB
+
+	dec %rax
+	jne	.L8_42
+	ALIGN_4
+
+.L8_46:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L8_49
+
+	ALIGN_4
+
+.L8_47:
+
+	KERNEL1x8_SUB
+
+	dec	%rax
+	jne	.L8_47
+	ALIGN_4
+
+
+.L8_49:
+
+	SAVE1x8				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 8), BO		// step BO over the unused k-steps (8 values each)
+        leaq    (AO, %rax, 1), AO		// step AO over the unused k-steps (1 value each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK				// LEFT: KK grows by the 1 row just processed
+#endif
+
+.L8_100:
+// End of one 8-column panel: update KK, advance B past the panel, then loop over J.
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $8, KK				// right side: KK grows by the 8 columns just processed
+#endif
+
+	movq	K, %rax
+	salq	$3, %rax		// * 8
+	leaq	(B , %rax, SIZE), B	// b += 8 * k, so the next panel (and the 4/2/1 remainders) read their own part of B
+
+	decq	J			// j --
+	jg	.L8_10
+
+
+/*************************************************************************************************/
+.L4_0:					// ---- 4-column remainder ----
+	movq	Nmod12, J		// J = N % 8 (Nmod12 holds N % 8 in this kernel)
+	testq	$4, J			// bit 2 set: one 4-column panel remains
+	je	.L2_0
+	ALIGN_4
+
+.L4_10:
+	movq	C, CO1			// CO1 = output pointer for this panel
+	leaq	(C, LDC, 4), C		// c += 4 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax		// LEFT: KK restarts from OFFSET for this panel
+        movq    %rax, KK
+#endif
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO		// AO = A + 16 elements (bias presumably expected by the KERNEL macros - confirm)
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L4_20			// fewer than 4 rows: only the remainder tiles run
+
+	ALIGN_4
+
+.L4_11:					// ---- one 4x4 tile per pass ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,4), BO		// skip the first KK k-steps of B (4 values each)
+	leaq	(AO,%rax,4), AO		// skip the first KK k-steps of A (4 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // LEFT: KK + 4 (values per k-step in AO)
+#else
+        addq    $4, %rax        // otherwise: KK + 4 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	sarq	$3, %rax			// rax = k-step count / 8 (groups of 8 steps)
+	cmpq    $2, %rax
+	jl	.L4_13			// fewer than two groups: handled separately
+	// I / M1 / M2 / E look like start, steady-state and end stages of a pipelined loop (macros not shown - confirm).
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subq $2, %rax			// first and last groups are peeled off the loop
+	je	.L4_12a			// nothing in between: run the last group
+
+	ALIGN_5
+
+.L4_12:					// middle groups, 8 k-steps each
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	dec	%rax
+	jne	.L4_12
+
+.L4_12a:				// last group, closed by KERNEL4x4_E
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	jmp .L4_16
+
+
+.L4_13:					// group count below two
+
+	test $1, %rax			// bit 0 of the group count
+	jz .L4_14			// clear: no full group to run
+
+	KERNEL4x4_I
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	jmp .L4_16
+
+
+.L4_14:
+
+	INIT4x4				// presumably clears the accumulators (macro not shown)
+
+
+.L4_16:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L4_19
+
+	ALIGN_4
+
+.L4_17:
+
+	KERNEL4x4_SUB
+
+	dec	%rax
+	jne	.L4_17
+	ALIGN_4
+
+
+.L4_19:
+
+	SAVE4x4				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 4), BO		// step BO over the unused k-steps (4 values each)
+        leaq    (AO, %rax, 4), AO		// step AO over the unused k-steps (4 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK				// LEFT: KK grows by the 4 rows just processed
+#endif
+
+	decq	I			# i --
+	jg	.L4_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L4_20:
+	// Rows left over after the 4-row tiles (M % 4), still within the 4-column panel
+
+	testq	$3, M
+	jz	.L4_100			// M is a multiple of 4: this panel is finished
+
+
+.L4_30:
+	testq	$2, M		// bit 1 of M set: a 2x4 tile is needed
+	jz	.L4_40
+
+	ALIGN_4
+
+.L4_31:					// ---- 2x4 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,4), BO		// skip the first KK k-steps of B (4 values each)
+	leaq	(AO,%rax,2), AO		// skip the first KK k-steps of A (2 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // LEFT: KK + 2 (values per k-step in AO)
+#else
+        addq    $4, %rax        // otherwise: KK + 4 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT2x4				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3, %rax		// rax = k-step count / 8
+	je	.L4_36
+	ALIGN_4
+
+.L4_32:					// 8 k-steps per pass
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	dec %rax
+	jne	.L4_32
+	ALIGN_4
+
+.L4_36:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L4_39
+
+	ALIGN_4
+
+.L4_37:
+
+	KERNEL2x4_SUB
+
+	dec %rax
+	jne	.L4_37
+
+
+.L4_39:
+
+	SAVE2x4				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 4), BO		// step BO over the unused k-steps (4 values each)
+        leaq    (AO, %rax, 2), AO		// step AO over the unused k-steps (2 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK				// LEFT: KK grows by the 2 rows just processed
+#endif
+
+
+.L4_40:
+	testq	$1, M		// bit 0 of M set: a final 1x4 tile is needed
+	jz	.L4_100		// no odd row: this panel is finished
+
+	ALIGN_4
+
+.L4_41:					// ---- 1x4 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,4), BO		// skip the first KK k-steps of B (4 values each)
+	leaq	(AO,%rax,1), AO		// skip the first KK k-steps of A (1 value each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // LEFT: KK + 1 (values per k-step in AO)
+#else
+        addq    $4, %rax        // otherwise: KK + 4 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT1x4				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3,%rax			// rax = k-step count / 8
+	je	.L4_46
+
+	ALIGN_4
+
+.L4_42:					// 8 k-steps per pass
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	dec %rax
+	jne	.L4_42
+	ALIGN_4
+
+.L4_46:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L4_49
+
+	ALIGN_4
+
+.L4_47:
+
+	KERNEL1x4_SUB
+
+	dec	%rax
+	jne	.L4_47
+	ALIGN_4
+
+
+.L4_49:
+
+	SAVE1x4				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 4), BO		// step BO over the unused k-steps (4 values each)
+        leaq    (AO, %rax, 1), AO		// step AO over the unused k-steps (1 value each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK				// LEFT: KK grows by the 1 row just processed
+#endif
+
+.L4_100:				// end of the 4-column panel
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $4, KK				// right side: KK grows by the 4 columns just processed
+#endif
+
+
+	movq	K, %rax
+	salq	$2, %rax		// * 4
+	leaq	(B , %rax, SIZE), B	// b += 4 * k: move past this panel of B
+
+
+
+
+/***************************************************************************************************************/
+
+.L2_0:					// ---- 2-column remainder ----
+
+	movq	Nmod12, J		// J = N % 8 (Nmod12 holds N % 8 in this kernel)
+	testq	$2, J			// bit 1 set: one 2-column panel remains
+	je	.L1_0
+
+.L2_10:
+	movq	C, CO1			// CO1 = output pointer for this panel
+	leaq	(C, LDC, 2), C		// c += 2 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax		// LEFT: KK restarts from OFFSET for this panel
+        movq    %rax, KK
+#endif
+
+
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO		// AO = A + 16 elements (bias presumably expected by the KERNEL macros - confirm)
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L2_20			// fewer than 4 rows: only the remainder tiles run
+
+	ALIGN_4
+
+.L2_11:					// ---- one 4x2 tile per pass ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,2), BO		// skip the first KK k-steps of B (2 values each)
+	leaq	(AO,%rax,4), AO		// skip the first KK k-steps of A (4 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // LEFT: KK + 4 (values per k-step in AO)
+#else
+        addq    $2, %rax        // otherwise: KK + 2 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT4x2				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq $3, %rax			// rax = k-step count / 8
+
+	je	.L2_16
+
+	ALIGN_5
+
+.L2_12:					// 8 k-steps per pass
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	dec	%rax
+	jne	.L2_12
+
+
+.L2_16:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L2_19
+
+	ALIGN_4
+
+.L2_17:
+
+	KERNEL4x2_SUB
+
+	dec	%rax
+	jne	.L2_17
+	ALIGN_4
+
+
+.L2_19:
+
+	SAVE4x2				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 2), BO		// step BO over the unused k-steps (2 values each)
+        leaq    (AO, %rax, 4), AO		// step AO over the unused k-steps (4 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK				// LEFT: KK grows by the 4 rows just processed
+#endif
+
+
+	decq	I			# i --
+	jg	.L2_11
+	ALIGN_4	
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L2_20:
+	// Rows left over after the 4-row tiles (M % 4), still within the 2-column panel
+
+	testq	$3, M
+	jz	.L2_100			// M is a multiple of 4: this panel is finished
+
+
+.L2_30:
+	testq	$2, M		// bit 1 of M set: a 2x2 tile is needed
+	jz	.L2_40
+
+	ALIGN_4
+
+.L2_31:					// ---- 2x2 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,2), BO		// skip the first KK k-steps of B (2 values each)
+	leaq	(AO,%rax,2), AO		// skip the first KK k-steps of A (2 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // LEFT: KK + 2 (values per k-step in AO)
+#else
+        addq    $2, %rax        // otherwise: KK + 2 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT2x2				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3, %rax		// rax = k-step count / 8
+	je	.L2_36
+	ALIGN_4
+
+.L2_32:					// 8 k-steps per pass
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	dec %rax
+	jne	.L2_32
+
+.L2_36:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L2_39
+
+	ALIGN_4
+
+.L2_37:
+
+	KERNEL2x2_SUB
+
+	dec %rax
+	jne	.L2_37
+
+
+.L2_39:
+
+	SAVE2x2				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 2), BO		// step BO over the unused k-steps (2 values each)
+        leaq    (AO, %rax, 2), AO		// step AO over the unused k-steps (2 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK				// LEFT: KK grows by the 2 rows just processed
+#endif
+
+
+.L2_40:
+	testq	$1, M		// bit 0 of M set: a final 1x2 tile is needed
+	jz	.L2_100		// no odd row: this panel is finished
+
+.L2_41:					// ---- 1x2 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,2), BO		// skip the first KK k-steps of B (2 values each)
+	leaq	(AO,%rax,1), AO		// skip the first KK k-steps of A (1 value each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // LEFT: KK + 1 (values per k-step in AO)
+#else
+        addq    $2, %rax        // otherwise: KK + 2 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT1x2				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3,%rax			// rax = k-step count / 8
+	je	.L2_46
+
+	ALIGN_4
+
+.L2_42:					// 8 k-steps per pass
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	dec %rax
+	jne	.L2_42
+
+.L2_46:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L2_49
+
+	ALIGN_4
+
+.L2_47:
+
+	KERNEL1x2_SUB
+
+	dec	%rax
+	jne	.L2_47
+
+.L2_49:
+
+	SAVE1x2				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 2), BO		// step BO over the unused k-steps (2 values each)
+        leaq    (AO, %rax, 1), AO		// step AO over the unused k-steps (1 value each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK				// LEFT: KK grows by the 1 row just processed
+#endif
+
+
+.L2_100:				// end of the 2-column panel
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $2, KK				// right side: KK grows by the 2 columns just processed
+#endif
+
+	movq	K, %rax
+	salq	$1, %rax		// * 2
+	leaq	(B , %rax, SIZE), B	// b += 2 * k: move past this panel of B
+
+/***************************************************************************************************************/
+
+.L1_0:					// ---- final single column ----
+
+	movq	Nmod12, J		// J = N % 8 (Nmod12 holds N % 8 in this kernel)
+	testq	$1, J			// bit 0 set: one last column remains
+	je	.L999
+
+.L1_10:
+	movq	C, CO1			// CO1 = output pointer for this column
+	leaq	(C, LDC, 1), C		// c += 1 * ldc
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        movq    OFFSET, %rax		// LEFT: KK restarts from OFFSET for this column
+        movq    %rax, KK
+#endif
+	
+	movq	A, AO		 	// aoffset = a
+	addq	$16 * SIZE, AO		// AO = A + 16 elements (bias presumably expected by the KERNEL macros - confirm)
+
+	movq	M,  I
+	sarq	$2, I			// i = m / 4
+	je	.L1_20			// fewer than 4 rows: only the remainder tiles run
+
+	ALIGN_4
+
+.L1_11:					// ---- one 4x1 tile per pass ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,1), BO		// skip the first KK k-steps of B (1 value each)
+	leaq	(AO,%rax,4), AO		// skip the first KK k-steps of A (4 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $4, %rax        // LEFT: KK + 4 (values per k-step in AO)
+#else
+        addq    $1, %rax        // otherwise: KK + 1 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT4x1				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3, %rax			// rax = k-step count / 8
+	je	.L1_16
+
+	ALIGN_5
+
+.L1_12:
+
+	KERNEL4x1			// one expansion per group (presumably 8 k-steps each - confirm against the macro)
+
+	dec	%rax
+	jne	.L1_12
+
+
+.L1_16:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L1_19
+
+	ALIGN_4
+
+.L1_17:
+
+	KERNEL4x1_SUB
+
+	dec	%rax
+	jne	.L1_17
+	ALIGN_4
+
+
+.L1_19:
+
+	SAVE4x1				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 1), BO		// step BO over the unused k-steps (1 value each)
+        leaq    (AO, %rax, 4), AO		// step AO over the unused k-steps (4 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $4, KK				// LEFT: KK grows by the 4 rows just processed
+#endif
+
+
+	decq	I			# i --
+	jg	.L1_11
+
+/**************************************************************************
+* Rest of M 
+***************************************************************************/
+.L1_20:
+	// Rows left over after the 4-row tiles (M % 4), still within the last column
+
+	testq	$3, M
+	jz	.L1_100	
+
+
+.L1_30:
+	testq	$2, M		// bit 1 of M set: a 2x1 tile is needed
+	jz	.L1_40
+
+	ALIGN_4
+
+.L1_31:					// ---- 2x1 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,1), BO		// skip the first KK k-steps of B (1 value each)
+	leaq	(AO,%rax,2), AO		// skip the first KK k-steps of A (2 values each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $2, %rax        // LEFT: KK + 2 (values per k-step in AO)
+#else
+        addq    $1, %rax        // otherwise: KK + 1 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT2x1				// presumably clears the accumulators (macro not shown); rax must survive it
+
+	sarq	$3, %rax		// rax = k-step count / 8
+	je	.L1_36
+	ALIGN_4
+
+.L1_32:					// 8 k-steps per pass
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+
+	dec %rax
+	jne	.L1_32
+
+.L1_36:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L1_39
+
+	ALIGN_4
+
+.L1_37:
+
+	KERNEL2x1_SUB
+
+	dec %rax
+	jne	.L1_37
+
+.L1_39:
+
+	SAVE2x1				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 1), BO		// step BO over the unused k-steps (1 value each)
+        leaq    (AO, %rax, 2), AO		// step AO over the unused k-steps (2 values each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $2, KK				// LEFT: KK grows by the 2 rows just processed
+#endif
+
+
+.L1_40:
+	testq	$1, M		// bit 0 of M set: a final 1x1 tile is needed
+	jz	.L1_100		// no odd row: the last column is finished
+
+
+.L1_41:					// ---- 1x1 tile ----
+
+#if !defined(TRMMKERNEL) || \
+        (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+        (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    B, BO        
+        addq    $12 * SIZE, BO		// BO = B + 12 elements; no KK skip in this variant
+#else
+        movq    B, BO        
+        addq    $12 * SIZE, BO
+        movq    KK, %rax
+	salq	$3, %rax		// rax * SIZE
+	leaq	(BO,%rax,1), BO		// skip the first KK k-steps of B (1 value each)
+	leaq	(AO,%rax,1), AO		// skip the first KK k-steps of A (1 value each)
+#endif
+// rax / KKK below = number of k-steps this tile accumulates over.
+
+#ifndef TRMMKERNEL
+        movq    K, %rax
+#elif (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+        movq    K, %rax
+        subq    KK, %rax
+        movq    %rax, KKK		// KKK = K - KK
+#else
+        movq    KK, %rax
+#ifdef LEFT
+        addq    $1, %rax        // LEFT: KK + 1 (values per k-step in AO)
+#else
+        addq    $1, %rax        // otherwise: KK + 1 (values per k-step in BO)
+#endif
+        movq    %rax, KKK
+#endif
+
+	INIT1x1				// presumably clears the accumulator (macro not shown); rax must survive it
+
+	sarq	$3,%rax			// rax = k-step count / 8
+	je	.L1_46
+
+	ALIGN_4
+
+.L1_42:					// 8 k-steps per pass
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	dec %rax
+	jne	.L1_42
+
+.L1_46:					// leftover k-steps, one at a time
+        movq    KKK, %rax
+
+	andq	$7, %rax		# rax = KKK % 8
+	je .L1_49
+
+	ALIGN_4
+
+.L1_47:
+
+	KERNEL1x1_SUB
+
+	dec	%rax
+	jne	.L1_47
+
+
+.L1_49:
+
+	SAVE1x1				// presumably writes the finished tile to C (macro not shown)
+
+#if (defined(TRMMKERNEL) &&  defined(LEFT) &&  defined(TRANSA)) || \
+    (defined(TRMMKERNEL) && !defined(LEFT) && !defined(TRANSA))
+        movq    K, %rax
+        subq    KKK, %rax		// k-steps this tile did not consume
+	salq	$3, %rax			// rax * SIZE
+        leaq    (BO, %rax, 1), BO		// step BO over the unused k-steps (1 value each)
+        leaq    (AO, %rax, 1), AO		// step AO over the unused k-steps (1 value each)
+#endif
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+        addq    $1, KK				// LEFT: KK grows by the 1 row just processed
+#endif
+
+
+
+.L1_100:				// end of the last column
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+        addq    $1, KK				// right side: KK grows by the 1 column just processed
+#endif
+
+
+
+.L999:					// common exit, also taken when M, N or K is zero
+
+	vzeroupper
+
+	movq   		SP, %rsp	// back to the stack pointer saved before the 4096-byte alignment
+	movq	   (%rsp), %rbx		// reload the registers spilled on entry
+	movq	  8(%rsp), %rbp
+	movq	 16(%rsp), %r12
+	movq	 24(%rsp), %r13
+	movq	 32(%rsp), %r14
+	movq	 40(%rsp), %r15
+
+#ifdef WINDOWS_ABI
+	movq	 48(%rsp), %rdi
+	movq	 56(%rsp), %rsi
+	vmovups	 64(%rsp), %xmm6
+	vmovups	 80(%rsp), %xmm7
+	vmovups	 96(%rsp), %xmm8
+	vmovups	112(%rsp), %xmm9
+	vmovups	128(%rsp), %xmm10
+	vmovups	144(%rsp), %xmm11
+	vmovups	160(%rsp), %xmm12
+	vmovups	176(%rsp), %xmm13
+	vmovups	192(%rsp), %xmm14
+	vmovups	208(%rsp), %xmm15
+#endif
+
+	addq	$STACKSIZE, %rsp	// release the register save area
+	ret
+
+	EPILOGUE
+
+
+
+
+
+#endif
diff --git a/kernel/x86_64/dtrmm_kernel_4x8_haswell.c b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
new file mode 100644
index 000000000..504c784ac
--- /dev/null
+++ b/kernel/x86_64/dtrmm_kernel_4x8_haswell.c
@@ -0,0 +1,1546 @@
+#include "common.h"
+#include <stdbool.h>
+
+
+static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7) __attribute__ ((noinline));
+/* 4x8 tile kernel: Cj[i] = *alpha * sum(k=0..n-1) a[4*k+i] * b[8*k+j] for i=0..3, j=0..7; each Cj[0..3] is overwritten, never read. */
+static void dtrmm_kernel_4x8( BLASLONG n, FLOAT *alpha ,FLOAT *a, FLOAT *b, FLOAT *C0, FLOAT *C1, FLOAT *C2,FLOAT *C3, FLOAT *C4, FLOAT *C5,FLOAT *C6, FLOAT *C7)
+{
+		/* All vector ops below are packed-double, so FLOAT is assumed to be double (TODO confirm against common.h). */
+		BLASLONG I = 0;
+		BLASLONG temp1 = n * 8;
+		/* I counts the doubles of b consumed so far (8 per k step); a is addressed as I*4 bytes, b as I*8 bytes. */
+		 __asm__  __volatile__
+        	(
+		"	vxorpd	%%ymm4 , %%ymm4 , %%ymm4			\n\t"
+		"	vxorpd	%%ymm5 , %%ymm5 , %%ymm5			\n\t"
+		"	vxorpd	%%ymm6 , %%ymm6 , %%ymm6			\n\t"
+		"	vxorpd	%%ymm7 , %%ymm7 , %%ymm7			\n\t"
+		"	vxorpd	%%ymm8 , %%ymm8 , %%ymm8			\n\t"
+		"	vxorpd	%%ymm9 , %%ymm9 , %%ymm9			\n\t"
+		"	vxorpd	%%ymm10, %%ymm10, %%ymm10			\n\t"
+		"	vxorpd	%%ymm11, %%ymm11, %%ymm11			\n\t"
+		/* n <= 0: run no k step (same as the scalar loop k<n); the tile then becomes *alpha * 0 */
+		"	cmp $0, %1						\n\t"
+		"	jle 2f							\n\t"
+
+		"	.align 16						\n\t"
+		"1:								\n\t"
+		"	vmovups   	(%2,%0,4) , %%ymm0			\n\t"
+		"	vmovups   	(%3,%0,8) , %%ymm1			\n\t"
+		"	vmovups       32(%3,%0,8) , %%ymm2			\n\t"
+		/* ymm0 = (a0,a1,a2,a3), ymm1 = b[0..3], ymm2 = b[4..7]; ymm4..7 pair with ymm1, ymm8..11 with ymm2 */
+		"	vfmadd231pd	%%ymm0 , %%ymm1 , %%ymm4		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm2 , %%ymm8		\n\t"
+		/* ymm0 -> (a1,a0,a3,a2) */
+		"	vpermpd         $0xb1  , %%ymm0 , %%ymm0		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm1 , %%ymm5		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm2 , %%ymm9		\n\t"
+		/* ymm0 -> (a2,a3,a0,a1) */
+		"	vpermpd         $0x1b  , %%ymm0 , %%ymm0		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm1 , %%ymm6		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm2 , %%ymm10		\n\t"
+		/* ymm0 -> (a3,a2,a1,a0) */
+		"	vpermpd         $0xb1  , %%ymm0 , %%ymm0		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm1 , %%ymm7		\n\t"
+		"	vfmadd231pd	%%ymm0 , %%ymm2 , %%ymm11		\n\t"
+		/* next k step: 8 more doubles of b (and, through the *4 scale, 4 more doubles of a) */
+		"	addq	$8 , %0						\n\t"
+		"	cmp	%0 , %1						\n\t"
+		"	jne	1b						\n\t"
+
+		"2:								\n\t"
+		/* scale every accumulator by *alpha */
+		"	vbroadcastsd	(%4), %%ymm0				\n\t"
+
+		"	vmulpd		%%ymm0 , %%ymm4 , %%ymm4		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm5 , %%ymm5		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm6 , %%ymm6		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm7 , %%ymm7		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm8 , %%ymm8		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm9 , %%ymm9		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm10, %%ymm10		\n\t"
+		"	vmulpd		%%ymm0 , %%ymm11, %%ymm11		\n\t"
+		/* undo the lane permutation: regroup ymm4..7 so that each register holds rows 0..3 of one column */
+		"	vpermpd 	$0xb1  , %%ymm5 , %%ymm5		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm7 , %%ymm7		\n\t"
+
+		"	vblendpd 	$0x0a  , %%ymm5 , %%ymm4 , %%ymm0	\n\t"
+		"	vblendpd 	$0x05  , %%ymm5 , %%ymm4 , %%ymm1	\n\t"
+		"	vblendpd 	$0x0a  , %%ymm7 , %%ymm6 , %%ymm2	\n\t"
+		"	vblendpd 	$0x05  , %%ymm7 , %%ymm6 , %%ymm3	\n\t"
+
+		"	vpermpd 	$0x1b  , %%ymm2 , %%ymm2		\n\t"
+		"	vpermpd 	$0x1b  , %%ymm3 , %%ymm3		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm2 , %%ymm2		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm3 , %%ymm3		\n\t"
+
+		"	vblendpd 	$0x03  , %%ymm0 , %%ymm2 , %%ymm4	\n\t"
+		"	vblendpd 	$0x03  , %%ymm1 , %%ymm3 , %%ymm5	\n\t"
+		"	vblendpd 	$0x03  , %%ymm2 , %%ymm0 , %%ymm6	\n\t"
+		"	vblendpd 	$0x03  , %%ymm3 , %%ymm1 , %%ymm7	\n\t"
+		/* ymm4..7 now hold columns 0..3 (rows 0..3 each) */
+		"	vmovups		%%ymm4 , (%5)				\n\t"
+		"	vmovups		%%ymm5 , (%6)				\n\t"
+		"	vmovups		%%ymm6 , (%7)				\n\t"
+		"	vmovups		%%ymm7 , (%8)				\n\t"
+		/* same regrouping for ymm8..11, which were accumulated against b[4..7] */
+		"	vpermpd 	$0xb1  , %%ymm9 , %%ymm9		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm11, %%ymm11		\n\t"
+
+		"	vblendpd 	$0x0a  , %%ymm9 , %%ymm8 , %%ymm0	\n\t"
+		"	vblendpd 	$0x05  , %%ymm9 , %%ymm8 , %%ymm1	\n\t"
+		"	vblendpd 	$0x0a  , %%ymm11, %%ymm10, %%ymm2	\n\t"
+		"	vblendpd 	$0x05  , %%ymm11, %%ymm10, %%ymm3	\n\t"
+
+		"	vpermpd 	$0x1b  , %%ymm2 , %%ymm2		\n\t"
+		"	vpermpd 	$0x1b  , %%ymm3 , %%ymm3		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm2 , %%ymm2		\n\t"
+		"	vpermpd 	$0xb1  , %%ymm3 , %%ymm3		\n\t"
+
+		"	vblendpd 	$0x03  , %%ymm0 , %%ymm2 , %%ymm4	\n\t"
+		"	vblendpd 	$0x03  , %%ymm1 , %%ymm3 , %%ymm5	\n\t"
+		"	vblendpd 	$0x03  , %%ymm2 , %%ymm0 , %%ymm6	\n\t"
+		"	vblendpd 	$0x03  , %%ymm3 , %%ymm1 , %%ymm7	\n\t"
+		/* ymm4..7 now hold columns 4..7 */
+		"	vmovups		%%ymm4 , (%9)				\n\t"
+		"	vmovups		%%ymm5 , (%10)				\n\t"
+		"	vmovups		%%ymm6 , (%11)				\n\t"
+		"	vmovups		%%ymm7 , (%12)				\n\t"
+		/* %0 is modified by the template, so it must be a read-write ("+") operand, not an input-only one */
+	        :
+		"+a" (I)	 // 0  (read-write: advanced inside the loop)
+        	:
+          	"r" (temp1),     // 1    
+          	"S" (a),         // 2
+          	"D" (b),         // 3
+          	"r" (alpha),     // 4
+		"r" (C0),	 // 5
+		"r" (C1),	 // 6
+		"r" (C2),	 // 7
+		"r" (C3),	 // 8
+		"r" (C4),	 // 9
+		"r" (C5),	 // 10
+		"r" (C6),	 // 11
+		"r" (C7) 	 // 12
+		: "cc",
+          	"%xmm0", "%xmm1", "%xmm2", "%xmm3",
+          	"%xmm4", "%xmm5", "%xmm6", "%xmm7",
+          	"%xmm8", "%xmm9", "%xmm10", "%xmm11",
+          	"%xmm12", "%xmm13", "%xmm14", "%xmm15",
+          	"memory"
+        	);
+
+
+
+
+}
+
+
+
+
+/*
+ * DTRMM block driver: for every tile of the bm x bn result it forms
+ * alpha * sum_k A(:,k) * B(k,:) over a trimmed k range and stores it into C
+ * (C is overwritten, never read).
+ *
+ *   bm, bn   rows / columns of C produced by this call
+ *   bk       full depth of the A and B panels
+ *   alpha    scale factor applied to every accumulated value
+ *   ba       A panels: consecutive blocks of 4, 2 or 1 rows, each block holding
+ *            bk steps of (block height) values
+ *   bb       B panels: consecutive groups of 8, 4, 2 or 1 columns, each group
+ *            holding bk steps of (group width) values
+ *   C, ldc   output pointer and distance, in elements, between two columns
+ *   offset   start value of `off` (negated when LEFT is not defined); `off`
+ *            selects which part of the k range a tile uses
+ *
+ * k range of a tile (rows x cols), with backwards = (LEFT != TRANSA):
+ *   backwards : skip the first `off` steps, use the remaining bk-off
+ *   otherwise : use the first off+rows (LEFT) or off+cols steps, skip the rest
+ * With LEFT, `off` restarts at offset for each column group and grows by the
+ * block height after each row block; without LEFT it grows by the group width
+ * after each column group (that increment is guarded by TRMMKERNEL).
+ *
+ * Bookkeeping: ptrbb is re-derived from bb for every tile, while ptrba runs
+ * through ba; skipped and used k steps always add up to bk, so after a tile
+ * ptrba points at the next row block.
+ *
+ * Only full 4x8 tiles go through dtrmm_kernel_4x8(); every other tile shape is
+ * computed by the scalar loops below.  Always returns 0.
+ */
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{
+
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*C4,*C5,*C6,*C7,*ptrba,*ptrbb;
+   /* resJ_I accumulates column J, row I of the current tile (res4_2..res7_3 appear only in the disabled reference code) */
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+
+   FLOAT res4_0;
+   FLOAT res4_1;
+   FLOAT res4_2;
+   FLOAT res4_3;
+
+   FLOAT res5_0;
+   FLOAT res5_1;
+   FLOAT res5_2;
+   FLOAT res5_3;
+
+   FLOAT res6_0;
+   FLOAT res6_1;
+   FLOAT res6_2;
+   FLOAT res6_3;
+
+   FLOAT res7_0;
+   FLOAT res7_1;
+   FLOAT res7_2;
+   FLOAT res7_3;
+   /* a0/a1: A values of the current k step (a0 serves rows 0 and 2, a1 rows 1 and 3) */
+   FLOAT a0;
+   FLOAT a1;
+   /* b0..b7: B values of the current k step, one per column of the tile */
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   FLOAT b4;
+   FLOAT b5;
+   FLOAT b6;
+   FLOAT b7;
+   /* off: running k offset derived from `offset`; temp: scratch count of k steps */
+   BLASLONG off, temp ;
+   /* compile-time configuration (LEFT / TRANSA) mirrored into run-time flags */
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+   /* backwards: a tile skips the first `off` k steps instead of the trailing ones */
+   backwards = left != transposed;
+   /* without LEFT, off starts at -offset once and is advanced per column group */
+   if (!left) {
+      off = -offset;
+   }
+
+
+   for (j=0; j<bn/8; j+=1) // column groups of 8: all bm rows x 8 columns
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+        C4 = C3+ldc;
+        C5 = C4+ldc;
+        C6 = C5+ldc;
+        C7 = C6+ldc;
+
+        /* with LEFT, off restarts at offset for every column group */
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x8 (vector kernel)
+	{
+
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // skip off k steps of A (4 values per step)
+		   ptrbb += off*8; // skip off k steps of B (8 values per step)
+                }
+		/*
+		 * Full 4x8 tile.  The 32 accumulators are kept in vector registers
+		 * inside dtrmm_kernel_4x8(), so none of the scalar res*_* variables
+		 * is used on this path.
+		 *
+		 * temp = number of k steps that contribute to this tile:
+		 *   backwards      : bk-off (the first `off` steps were skipped above)
+		 *   LEFT, forward  : off+4  (4 = tile height)
+		 *   right, forward : off+8  (8 = tile width)
+		 * The kernel scales by alpha and stores straight into C0..C7.
+		 */
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 8;  // number of values in B
+
+	        dtrmm_kernel_4x8( temp, &alpha , ptrba, ptrbb, C0, C1, C2, C3, C4, C5, C6, C7);
+		/* the kernel does not move the pointers: step past the temp k steps it consumed */
+		ptrba = ptrba + temp * 4;
+		ptrbb = ptrbb + temp * 8;
+
+/* Disabled scalar equivalent of the dtrmm_kernel_4x8() call above, kept as a reference:
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+			res4_2 += a0*b4;
+			res5_2 += a0*b5;
+			res6_2 += a0*b6;
+			res7_2 += a0*b7;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			res4_3 += a1*b4;
+			res5_3 += a1*b5;
+			res6_3 += a1*b6;
+			res7_3 += a1*b7;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+		res4_2 *= alpha;
+		res4_3 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+		res5_2 *= alpha;
+		res5_3 *= alpha;
+
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+		res6_2 *= alpha;
+		res6_3 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+		res7_2 *= alpha;
+		res7_3 *= alpha;
+
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+		C4[2] = res4_2;
+		C4[3] = res4_3;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+		C5[2] = res5_2;
+		C5[3] = res5_3;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+		C6[2] = res6_2;
+		C6[3] = res6_3;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+		C7[2] = res7_2;
+		C7[3] = res7_3;
+*/
+		if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 8;  // number of values in B
+
+                    ptrba += temp*4; // skip the unused trailing k steps of A
+		    ptrbb += temp*8; // skip the unused trailing k steps of B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+		C4 = C4+4;
+		C5 = C5+4;
+		C6 = C6+4;
+		C7 = C7+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x8 block
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+		res4_0 = 0;
+		res4_1 = 0;
+
+		res5_0 = 0;
+		res5_1 = 0;
+
+		res6_0 = 0;
+		res6_1 = 0;
+
+		res7_0 = 0;
+		res7_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+			res4_1 += a1*b4;
+			res5_1 += a1*b5;
+			res6_1 += a1*b6;
+			res7_1 += a1*b7;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		res4_0 *= alpha;
+		res4_1 *= alpha;
+
+		res5_0 *= alpha;
+		res5_1 *= alpha;
+
+		res6_0 *= alpha;
+		res6_1 *= alpha;
+
+		res7_0 *= alpha;
+		res7_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+		C4[0] = res4_0;
+		C4[1] = res4_1;
+
+		C5[0] = res5_0;
+		C5[1] = res5_1;
+
+		C6[0] = res6_0;
+		C6[1] = res6_1;
+
+		C7[0] = res7_0;
+		C7[1] = res7_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+		C4 = C4+2;
+		C5 = C5+2;
+		C6 = C6+2;
+		C7 = C7+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x8 block
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*8;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+		res4_0 = 0;
+		res5_0 = 0;
+		res6_0 = 0;
+		res7_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+8;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+			b4 = ptrbb[4];
+			b5 = ptrbb[5];
+			b6 = ptrbb[6];
+			b7 = ptrbb[7];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+			res4_0 += a0*b4;
+			res5_0 += a0*b5;
+			res6_0 += a0*b6;
+			res7_0 += a0*b7;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+8;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+		res4_0 *= alpha;
+		res5_0 *= alpha;
+		res6_0 *= alpha;
+		res7_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+		C4[0] = res4_0;
+		C5[0] = res5_0;
+		C6[0] = res6_0;
+		C7[0] = res7_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 8; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*8;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+		C4 = C4+1;
+		C5 = C5+1;
+		C6 = C6+1;
+		C7 = C7+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 8;
+#endif
+
+        k = (bk<<3); // one B column group = bk steps of 8 values
+        bb = bb+k;
+        i = (ldc<<3); // move C forward by 8 columns
+        C = C+i;
+    }
+
+
+   /* remaining columns: at most one group of 4, then one of 2, then one of 1 */
+   for (j=0; j<(bn&4); j+=4) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+	{
+
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // number of values in A
+		   ptrbb += off*4; // number of values in B
+                }
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+
+		if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+		    ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+
+        k = (bk<<2); // one B column group = bk steps of 4 values
+        bb = bb+k;
+        i = (ldc<<2); // move C forward by 4 columns
+        C = C+i;
+    }
+
+
+   /* group of 2 columns, taken when bit 1 of bn is set */
+   for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+		off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+
+        k = (bk<<1); // one B column group = bk steps of 2 values
+        bb = bb+k;
+        i = (ldc<<1); // move C forward by 2 columns
+        C = C+i;
+    }
+
+
+
+
+
+
+   /* last single column, taken when bn is odd */
+   for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+
+        k = (bk<<0); // one B column = bk values
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/param.h b/param.h
index 245b678ef..6c9ca83da 100644
--- a/param.h
+++ b/param.h
@@ -1414,7 +1414,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define XGEMM_DEFAULT_UNROLL_M 1
 
 #define SGEMM_DEFAULT_UNROLL_N 4
-#define DGEMM_DEFAULT_UNROLL_N 4
+#define DGEMM_DEFAULT_UNROLL_N 8
 #define QGEMM_DEFAULT_UNROLL_N 2
 #define CGEMM_DEFAULT_UNROLL_N 2
 #define ZGEMM_DEFAULT_UNROLL_N 2