diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md
index 18a218cec..02d15b7f3 100644
--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@@ -117,5 +117,9 @@ In chronological order:
 * Isaac Dunham <https://github.com/idunham>
   * [2014-08-03] Fixed link error on Linux/musl
 
+* Dave Nuechterlein
+  * [2014-10-10] trmm and sgemm kernels (optimized for APM's X-Gene 1).
+                 ARMv8 support.
+
 * [Your name or handle] <[email or website]>
   * [Date] [Brief summary of your changes]
diff --git a/common_arm64.h b/common_arm64.h
index 8a66a1702..4855493da 100644
--- a/common_arm64.h
+++ b/common_arm64.h
@@ -119,9 +119,9 @@ static inline int blas_quickdivide(blasint x, blasint y){
 }
 
 #if defined(DOUBLE)
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f64 d1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str d1, %0" : "=m"(res) : : "memory")
 #else
-#define GET_IMAGE(res)  __asm__ __volatile__("vstr.f32 s1, %0" : "=m"(res) : : "memory")
+#define GET_IMAGE(res)  __asm__ __volatile__("str s1, %0" : "=m"(res) : : "memory")
 #endif
 
 #define GET_IMAGE_CANCEL
@@ -138,7 +138,6 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #if defined(ASSEMBLER) && !defined(NEEDPARAM)
 
 #define PROLOGUE \
-	.arm		 ;\
 	.global	REALNAME ;\
 	.func	REALNAME  ;\
 REALNAME:
diff --git a/cpuid_arm64.c b/cpuid_arm64.c
new file mode 100644
index 000000000..c7a27f891
--- /dev/null
+++ b/cpuid_arm64.c
@@ -0,0 +1,217 @@
+/**************************************************************************
+  Copyright (c) 2013, The OpenBLAS Project
+  All rights reserved.
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are
+  met:
+  1. Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+  2. Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in
+  the documentation and/or other materials provided with the
+  distribution.
+  3. Neither the name of the OpenBLAS project nor the names of
+  its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+  DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+  SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+  CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+  OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+  USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+  *****************************************************************************/
+
+#include <string.h>
+
+#define CPU_UNKNOWN     	0
+#define CPU_ARMV8       	1
+/* Printable core names, indexed by the CPU_* ids above (see get_corename). */
+static char *cpuname[] = {
+  "UNKNOWN",	/* CPU_UNKNOWN; previously misspelled "UNKOWN" */
+  "ARMV8"	/* CPU_ARMV8 */
+};
+
+
+/*
+ * Return 1 when the "Features" line of /proc/cpuinfo lists the flag named
+ * by search, 0 otherwise (also on non-Linux builds, or when the file or
+ * the line is unavailable).
+ */
+int get_feature(char *search)
+{
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p, *t;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return(0);	/* fgets/fclose on NULL would crash */
+
+	p = (char *) NULL;
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':');	/* stays NULL without a ':' */
+			break;
+		}
+	}
+	fclose(infile);
+
+	if (p == NULL) return(0);
+	/* Start after the ':'; every token is compared, and the delimiter set strips the newline. */
+	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
+	{
+		if (!strcmp(t, search)) return(1);
+	}
+#endif
+	return(0);
+}
+
+
+/*
+ * Identify the host CPU from /proc/cpuinfo.
+ *
+ * Looks at the first "model name" or "Processor" line and returns
+ * CPU_ARMV8 when its value mentions "AArch64".  Returns CPU_UNKNOWN in
+ * every other case: non-Linux build, unreadable file, no such line, or a
+ * line without a ':' separator.
+ */
+int detect(void)
+{
+#ifdef linux
+	FILE *infile;
+	char buffer[512], *p;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL)		/* fgets/fclose on NULL would crash */
+	{
+		return CPU_UNKNOWN;
+	}
+
+	p = (char *) NULL;
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if ((!strncmp("model name", buffer, 10)) || (!strncmp("Processor", buffer, 9)))
+		{
+			p = strchr(buffer, ':');	/* stays NULL without a ':' */
+			break;
+		}
+	}
+	fclose(infile);
+
+	if ((p != NULL) && strstr(p, "AArch64"))
+	{
+		return CPU_ARMV8;
+	}
+#endif
+	return CPU_UNKNOWN;
+}
+/* Name of the detected core: the CPU_* id from detect() indexes cpuname[]. */
+char *get_corename(void)
+{
+	int id = detect();
+	return cpuname[id];
+}
+/* Print the architecture family name, without a trailing newline. */
+void get_architecture(void)
+{
+	fputs("ARM", stdout);
+}
+/* Print the sub-architecture of the detected CPU, without a newline. */
+void get_subarchitecture(void)
+{
+	const char *label;
+
+	if (detect() == CPU_ARMV8)
+	{
+		label = "ARMV8";
+	}
+	else
+	{
+		label = "UNKNOWN";
+	}
+	printf("%s", label);
+}
+
+/* Print the sub-directory name ("arm64"), without a trailing newline. */
+void get_subdirname(void)
+{
+	fputs("arm64", stdout);
+}
+/*
+ * Print the cache/TLB parameter #defines for the detected core, one per
+ * line.  Nothing is printed when the core is not recognised.
+ */
+void get_cpuconfig(void)
+{
+	if (detect() != CPU_ARMV8)
+	{
+		return;
+	}
+
+	/* One call; adjacent string literals are concatenated by the compiler. */
+	printf("#define ARMV8\n"
+	       "#define L1_DATA_SIZE 32768\n"
+	       "#define L1_DATA_LINESIZE 64\n"
+	       "#define L2_SIZE 262144\n"
+	       "#define L2_LINESIZE 64\n"
+	       "#define DTB_DEFAULT_ENTRIES 64\n"
+	       "#define DTB_SIZE 4096\n"
+	       "#define L2_ASSOCIATIVE 4\n");
+}
+
+
+/*
+ * Print the library name for the detected core ("armv8" plus a newline);
+ * prints nothing when the core is not recognised.
+ */
+void get_libname(void)
+{
+	int core = detect();
+
+	if (core == CPU_ARMV8)
+	{
+		printf("armv8\n");
+	}
+}
+
+
+/*
+ * Walk the flags on the "Features" line of /proc/cpuinfo.  No flag is acted
+ * upon, so there is no observable effect.  Does nothing on non-Linux builds
+ * or when the file or the line is unavailable.
+ */
+void get_features(void)
+{
+#ifdef linux
+	FILE *infile;
+	char buffer[2048], *p, *t;
+
+	infile = fopen("/proc/cpuinfo", "r");
+	if (infile == NULL) return;	/* fgets/fclose on NULL would crash */
+
+	p = (char *) NULL;
+	while (fgets(buffer, sizeof(buffer), infile))
+	{
+		if (!strncmp("Features", buffer, 8))
+		{
+			p = strchr(buffer, ':');	/* stays NULL without a ':' */
+			break;
+		}
+	}
+	fclose(infile);
+
+	if (p == NULL) return;
+
+	for (t = strtok(p + 1, " \t\n"); t != NULL; t = strtok(NULL, " \t\n"))
+	{
+		/* no flag is acted upon */
+	}
+#endif
+}
+
+
diff --git a/getarch.c b/getarch.c
index 3e9914259..ded347ecc 100644
--- a/getarch.c
+++ b/getarch.c
@@ -746,12 +746,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define SUBARCHITECTURE "ARMV8"
 #define SUBDIRNAME      "arm64"
 #define ARCHCONFIG   "-DARMV8 " \
-       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
-       "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
-       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " \
-       "-DHAVE_VFP -DHAVE_VFPV3 -DHAVE_VFPV4"
+       "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=64 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=64 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=32 " 
 #define LIBNAME   "armv8"
-#define CORENAME  "ARMV8"
+#define CORENAME  "XGENE1"
 #else
 #endif
 
@@ -801,6 +800,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif
 
+#ifdef __aarch64__
+#include "cpuid_arm64.c"
+#define OPENBLAS_SUPPORTED
+#endif
+
 
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
@@ -856,7 +860,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("CORE=%s\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("CORE=%s\n", get_corename());
 #endif
 #endif
@@ -956,7 +960,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
     printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__)
+#if defined(__i386__) || defined(__x86_64__) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__)
     printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
diff --git a/kernel/arm64/KERNEL.ARMV8 b/kernel/arm64/KERNEL.ARMV8
index 27157dad1..4fc0968cd 100644
--- a/kernel/arm64/KERNEL.ARMV8
+++ b/kernel/arm64/KERNEL.ARMV8
@@ -80,14 +80,14 @@ DGEMVTKERNEL = ../arm/gemv_t.c
 CGEMVTKERNEL = ../arm/zgemv_t.c
 ZGEMVTKERNEL = ../arm/zgemv_t.c
 
-STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+STRMMKERNEL	= ../generic/trmmkernel_4x4.c
 DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
 CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
 
-SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
-SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMKERNEL    =  sgemm_kernel_4x4.S
+SGEMMONCOPY    =  ../generic/gemm_ncopy_4.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_4.c
 SGEMMONCOPYOBJ =  sgemm_oncopy.o
 SGEMMOTCOPYOBJ =  sgemm_otcopy.o
 
diff --git a/kernel/arm64/sgemm_kernel_4x4.S b/kernel/arm64/sgemm_kernel_4x4.S
new file mode 100644
index 000000000..78633297f
--- /dev/null
+++ b/kernel/arm64/sgemm_kernel_4x4.S
@@ -0,0 +1,1327 @@
+/***************************************************************************
+Copyright (c) 2013, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+/**************************************************************************************
+* 2013/11/23 Saar
+* 	 BLASTEST 		: OK
+* 	 CTEST			: OK
+* 	 TEST			: OK
+*
+*
+* 2013/11/02 Saar
+*	UNROLL_N		4
+*	UNROLL_M		4
+*	DGEMM_P			128
+*	DGEMM_Q			240
+*	DGEMM_R			12288
+*	A_PRE			128
+*	B_PRE			128
+*	C_PRE			32
+*
+* Performance on Odroid U2:
+*
+* 3072x3072		1 Core:		2.62 GFLOPS	ATLAS: 2.69	GFLOPS
+* 3072x3072		2 Cores:	5.23 GFLOPS	ATLAS: 5.27	GFLOPS
+* 3072x3072		3 Cores:	7.78 GFLOPS	ATLAS: 7.87	GFLOPS
+* 3072x3072		4 Cores:       10.10 GFLOPS	ATLAS: 9.98	GFLOPS
+**************************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+/*                   X0          X1          X2          s0        X3        x4       x5           x6*/
+/*int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc) */
+
+
+#define origM x0
+#define origN x1
+#define origK x2
+#define origPA x3
+#define origPB x4
+#define pC x5
+#define LDC x6
+#define offset x7
+#define counterL x8
+#define counterI x9
+#define pB x10
+#define counterJ x11
+#define tempALPHA x12
+#define pCRow0 x13
+#define pCRow1 x14
+#define pCRow2 x15
+#define pA x16
+
+// 00 origM
+// 01 origN
+// 02 origK
+// 03 origPA
+// 04 origPB
+// 05 pC
+// 06 origLDC -> LDC
+// 07 offset
+// 08 counterL
+// 09 counterI
+// 10 pB
+// 11 counterJ
+// 12 tempALPHA      
+// 13 pCRow0
+// 14 pCRow1
+// 15 pCRow2
+// 16 pA
+// 17
+// 18 must save
+// 19 must save
+// 20 must save
+// 21 must save
+// 22 must save
+// 23 must save
+// 24 must save
+// 25 must save
+// 26 must save
+// 27 must save
+// 28 must save
+// 29 frame
+// 30 link
+// 31 sp
+
+//v00 orig ALPHA -> a00
+//v01 a01
+//v02 a02
+//v03 a03
+//v04 a10
+//v05 a11
+//v06 a12
+//v07 a13
+//v08 must save b00
+//v09 must save b01
+//v10 must save b02
+//v11 must save b03
+//v12 must save b10
+//v13 must save b11
+//v14 must save b12
+//v15 must save b13
+//v16 must save  C00
+//v17 must save  C01
+//v18  C02
+//v19  C03
+//v20  C10
+//v21  C11
+//v22  C12
+//v23  C13
+//v24  C20
+//v25  C21
+//v26  C22
+//v27  C23
+//v28  C30
+//v29  C31
+//v30  C32
+//v31  C33
+
+//        add     sp,sp,#-(6*16)
+//        stp     x18,x19,[sp,#(0*16)]
+//        stp     x20,x21,[sp,#(1*16)]
+
+
+/**************************************************************************************
+* Macro definitions
+**************************************************************************************/
+
+.macro INIT4x4
+	// Clear the four accumulator vectors of the 4x4 tile. movi writes a true
+	// zero, whereas x - x yields NaN whenever x holds Inf or NaN.
+	movi		v16.4s, #0
+	movi		v20.4s, #0
+	movi		v24.4s, #0
+	movi		v28.4s, #0
+.endm
+
+.macro KERNEL4x4_I
+	// First step of the pipelined K loop: overwrites (rather than adds to)
+	// the accumulators, then preloads the operands for the following step.
+	ld1	{v8.2s}, [pB], #8
+	ld1	{v10.2s}, [pB], #8
+	ld1	{v0.4s}, [pA], #16
+	// Plain fmul keeps IEEE semantics; fmulx returns +/-2.0 for 0 * Inf
+	// where a NaN is expected.
+	fmul	v16.4s, v0.4s, v8.s[0]
+	fmul	v20.4s, v0.4s, v8.s[1]
+	fmul	v24.4s, v0.4s, v10.s[0]
+	fmul	v28.4s, v0.4s, v10.s[1]
+
+	ld1	{v12.2s}, [pB], #8	// for next round
+	ld1	{v14.2s}, [pB], #8	// for next round
+	ld1	{v4.4s}, [pA], #16	// for next round
+.endm
+
+
+.macro KERNEL4x4_M2
+	// Accumulate the operands already held in v4/v12/v14, then load the next ones into v0/v8/v10.
+	fmla  	v16.4s, v4.4s, v12.s[0]
+	fmla  	v20.4s, v4.4s, v12.s[1]
+	fmla 	v24.4s, v4.4s, v14.s[0]
+	fmla  	v28.4s, v4.4s, v14.s[1]
+
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+        ld1     {v0.4s},[pA],#16
+
+.endm
+
+
+.macro KERNEL4x4_M1
+	// Accumulate the operands already held in v0/v8/v10, then load the next ones into v4/v12/v14.
+	fmla 	v16.4s, v0.4s, v8.s[0]
+	fmla 	v20.4s, v0.4s, v8.s[1]
+	fmla 	v24.4s, v0.4s, v10.s[0]
+	fmla 	v28.4s, v0.4s, v10.s[1]
+
+        ld1     {v12.2s},[pB],#8
+        ld1     {v14.2s},[pB],#8
+        ld1     {v4.4s},[pA],#16
+
+.endm
+
+
+
+.macro KERNEL4x4_E
+	// Final pipelined step: accumulate from v4/v12/v14 without loading anything further.
+	fmla 	v16.4s, v4.4s, v12.s[0]
+	fmla 	v20.4s, v4.4s, v12.s[1]
+	fmla 	v24.4s, v4.4s, v14.s[0]
+	fmla 	v28.4s, v4.4s, v14.s[1]
+
+.endm
+
+
+
+
+.macro KERNEL4x4_SUB
+	// Stand-alone K step: load 4 floats from pB and 4 from pA (post-incrementing both), then accumulate.
+        ld1     {v8.2s},[pB],#8
+        ld1     {v10.2s},[pB],#8
+	ld1	{v0.4s} , [pA],#16
+
+	fmla 	v16.4s, v0.4s, v8.s[0]
+	fmla 	v20.4s, v0.4s, v8.s[1]
+	fmla 	v24.4s, v0.4s, v10.s[0]
+	fmla 	v28.4s, v0.4s, v10.s[1]
+
+.endm
+
+
+
+
+.macro SAVE4x4
+	// C tile += alpha * accumulators (v16/v20/v24/v28). pCRow0 advances by 16 bytes; pCRow1/pCRow2 are scratch.
+	add	pCRow1, pCRow0, LDC    // create a second row pointer from the first row pointer
+	mov	v0.d[0], tempALPHA     // s0 = alpha again (v0 was used for A operands)
+
+        ld1     {v8.4s},[pCRow0]   // load 4 values of C from first row
+        fmla     v8.4s ,v16.4s,v0.s[0]
+	st1 	{v8.4s},[pCRow0],#16 // store C from first row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from second row
+        fmla     v12.4s ,v20.4s,v0.s[0]
+	st1 	{v12.4s},[pCRow1] // store C from second row
+
+	add	pCRow2, pCRow1, LDC        // Row2 points to third row
+
+        ld1     {v8.4s},[pCRow2]   // load 4 values of C from third row
+        fmla     v8.4s ,v24.4s,v0.s[0]
+	st1 	{v8.4s} ,[pCRow2]  // store C from third row
+
+	add	pCRow1, pCRow2 , LDC // row1 points to fourth row
+
+        ld1     {v12.4s},[pCRow1]   // load 4 values of C from fourth row
+        fmla     v12.4s ,v28.4s,v0.s[0]
+	st1     {v12.4s},[pCRow1]  // store fourth row
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT2x4
+	// Clear the 2x4 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s24, s16
+	fmov		s25, s16
+	fmov		s28, s16
+	fmov		s29, s16
+
+.endm
+
+
+
+.macro KERNEL2x4_SUB
+	// One K step of a 2x4 tile: 2 floats from pA times 4 floats from pB, accumulated into s16/s17, s20/s21, s24/s25, s28/s29.
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+
+	fmadd 	s24  , s0,  s10,	s24  
+	fmadd 	s25  , s1,  s10,	s25  
+
+	fmadd 	s28  , s0,  s11,	s28  
+	fmadd 	s29  , s1,  s11,	s29  
+	add	pA , pA, #8
+	add	pB , pB, #16
+
+.endm
+/* cpp helpers used by the SAVE macros: F1ST(d, a, b) emits d = a * b + d; L1ST(d, base, off) emits a load of d from [base, off]. */
+            #define F1ST( op1, op2, op3) fmadd op1, op2, op3, op1
+            #define L1ST( op1, op2, op3) ldr op1, [op2,  op3]
+
+.macro SAVE2x4
+	// C tile (2 floats in each of 4 rows spaced LDC bytes apart) += alpha * accumulators; pCRow0 advances by 8 bytes.
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2  , pCRow1, LDC
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0, #0]
+	str 	s9 , [pCRow0, #4 ]
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+
+	str 	s12, [pCRow1, #0]
+	str 	s13, [pCRow1, #4 ]
+
+	L1ST (	s8,pCRow2 , #0)
+	L1ST (	s9,pCRow2 , #4 )
+
+	F1ST ( 	s8 , s0 , s24)
+	F1ST ( 	s9 , s0 , s25)
+
+	str 	s8 , [pCRow2 , #0]
+	str 	s9 , [pCRow2 , #4 ]
+
+	add	pCRow1, pCRow2 , LDC	// pCRow1 now addresses the fourth row
+
+	ldr	s12, [pCRow1, #0]
+	ldr	s13, [pCRow1, #4 ]
+
+	F1ST ( 	s12, s0 , s28)
+	F1ST ( 	s13, s0 , s29)
+
+	str 	s12, [pCRow1, #0]
+	str 	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT1x4
+	// Clear the 1x4 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s20, s16
+	fmov		s24, s16
+	fmov		s28, s16
+
+.endm
+
+
+
+.macro KERNEL1x4_SUB
+	// One K step of a 1x4 tile: 1 float from pA times 4 floats from pB, accumulated into s16/s20/s24/s28.
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+	ldr	s10, [ pB, #8 ]
+	ldr	s11, [ pB, #12 ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s24  , s0,  s10,	s24  
+	fmadd 	s28  , s0,  s11,	s28  
+
+	add	pA , pA, #4
+	add	pB , pB, #16
+
+.endm
+
+.macro SAVE1x4
+	// C tile (1 float in each of 4 rows spaced LDC bytes apart) += alpha * accumulators; pCRow0 advances by 4 bytes.
+	add	pCRow1 , pCRow0, LDC
+	add	pCRow2  , pCRow1, LDC
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0, #0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s20)
+	str 	s12, [pCRow1, #0]
+
+	L1ST (	s8,pCRow2 , #0)
+	F1ST ( 	s8 , s0 , s24)
+	str 	s8 , [pCRow2 , #0]
+
+	add	pCRow1, pCRow2 , LDC	// pCRow1 now addresses the fourth row
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s28)
+	str 	s12, [pCRow1, #0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x2
+	// Clear the 4x2 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+	fmov		s20, s16
+	fmov		s21, s16
+	fmov		s22, s16
+	fmov		s23, s16
+
+.endm
+
+
+
+.macro KERNEL4x2_SUB
+	// One K step of a 4x2 tile: 4 floats from pA times 2 floats from pB, accumulated into s16-s19 and s20-s23.
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+	fmadd 	s18  , s2,  s8,	s18  
+	fmadd 	s19  , s3,  s8,	s19  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+	fmadd 	s22  , s2,  s9,	s22  
+	fmadd 	s23  , s3,  s9,	s23  
+
+	add	pA , pA, #16
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE4x2
+	// C tile (4 floats in each of 2 rows spaced LDC bytes apart) += alpha * accumulators; pCRow0 advances by 16 bytes.
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0)
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+	F1ST ( 	s10, s0 , s18)
+	F1ST ( 	s11, s0 , s19)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+	str 	s10, [pCRow0, #8 ]
+	str 	s11, [pCRow0, #12 ]
+
+	L1ST (	s12,pCRow1, #0)
+	L1ST (	s13,pCRow1, #4 )
+	L1ST (	s14,pCRow1, #8 )
+	L1ST (	s15,pCRow1, #12 )
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+	F1ST ( 	s14, s0 , s22)
+	F1ST ( 	s15, s0 , s23)
+
+	str 	s12, [pCRow1]
+	str 	s13, [pCRow1, #4 ]
+	str 	s14, [pCRow1, #8 ]
+	str 	s15, [pCRow1, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+/******************************************************************************/
+
+.macro INIT2x2
+	// Clear the 2x2 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s20, s16
+	fmov		s21, s16
+
+.endm
+
+
+
+.macro KERNEL2x2_SUB
+	// One K step of a 2x2 tile: 2 floats from pA times 2 floats from pB, accumulated into s16/s17 and s20/s21.
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	fmadd 	s20  , s0,  s9,	s20  
+	fmadd 	s21  , s1,  s9,	s21  
+
+	add	pA , pA, #8
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE2x2
+	// C tile (2 floats in each of 2 rows spaced LDC bytes apart) += alpha * accumulators; pCRow0 advances by 8 bytes.
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+
+	L1ST (	s12,pCRow1, #0 )
+	L1ST (	s13,pCRow1, #4 )
+
+	F1ST ( 	s12, s0 , s20)
+	F1ST ( 	s13, s0 , s21)
+
+	str 	s12, [pCRow1]
+	str 	s13, [pCRow1, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x2
+	// Clear the 1x2 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s20, s16
+
+.endm
+
+
+
+.macro KERNEL1x2_SUB
+	// One K step of a 1x2 tile: 1 float from pA times 2 floats from pB, accumulated into s16 and s20.
+	ldr	s8 , [ pB ]
+	ldr	s9 , [ pB, #4 ]
+
+	ldr	s0 , [ pA ]
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s20  , s0,  s9,	s20  
+
+	add	pA , pA, #4
+	add	pB , pB, #8
+
+.endm
+
+.macro SAVE1x2
+	// C tile (1 float in each of 2 rows spaced LDC bytes apart) += alpha * accumulators; pCRow0 advances by 4 bytes.
+	add	pCRow1 , pCRow0, LDC
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0)
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0]
+
+	L1ST (	s12,pCRow1, #0)
+	F1ST ( 	s12, s0 , s20)
+	str 	s12, [pCRow1]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+/******************************************************************************/
+/******************************************************************************/
+
+.macro INIT4x1
+	// Clear the 4x1 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s17, s16
+	fmov		s18, s16
+	fmov		s19, s16
+
+.endm
+
+
+
+.macro KERNEL4x1_SUB
+	// One K step of a 4x1 tile: 4 floats from pA times 1 float from pB, accumulated into s16-s19.
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+	ldr	s2 , [ pA, #8 ]
+	ldr	s3 , [ pA, #12 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+	fmadd 	s18  , s2,  s8,	s18  
+	fmadd 	s19  , s3,  s8,	s19  
+
+	add	pA , pA, #16
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE4x1
+	// C tile (4 consecutive floats at pCRow0) += alpha * accumulators; pCRow0 advances by 16 bytes.
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+	L1ST (	s10,pCRow0, #8 )
+	L1ST (	s11,pCRow0, #12 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+	F1ST ( 	s10, s0 , s18)
+	F1ST ( 	s11, s0 , s19)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+	str 	s10, [pCRow0, #8 ]
+	str 	s11, [pCRow0, #12 ]
+
+	add	pCRow0, pCRow0, #16
+
+.endm
+
+
+
+
+/******************************************************************************/
+
+.macro INIT2x1
+	// Clear the 2x1 accumulators. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+	fmov		s17, s16
+
+.endm
+
+
+
+.macro KERNEL2x1_SUB
+	// One K step of a 2x1 tile: 2 floats from pA times 1 float from pB, accumulated into s16/s17.
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+	ldr	s1 , [ pA, #4 ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+	fmadd 	s17  , s1,  s8,	s17  
+
+	add	pA , pA, #8
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE2x1
+	// C tile (2 consecutive floats at pCRow0) += alpha * accumulators; pCRow0 advances by 8 bytes.
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for A operands)
+
+	L1ST (	s8,pCRow0, #0 )
+	L1ST (	s9,pCRow0, #4 )
+
+	F1ST ( 	s8 , s0 , s16)
+	F1ST ( 	s9 , s0 , s17)
+
+	str 	s8 , [pCRow0]
+	str 	s9 , [pCRow0, #4 ]
+
+	add	pCRow0, pCRow0, #8
+
+.endm
+
+/******************************************************************************/
+
+.macro INIT1x1
+	// Clear the accumulator. Zero comes from wzr: s16 - s16 is NaN when s16 holds Inf or NaN.
+	fmov		s16, wzr
+
+.endm
+
+
+
+.macro KERNEL1x1_SUB
+	// One K step of a 1x1 tile: s16 += (float at pA) * (float at pB); both pointers advance by 4 bytes.
+	ldr	s8 , [ pB ]
+
+	ldr	s0 , [ pA ]
+
+	fmadd 	s16  , s0,  s8,	s16  
+
+	add	pA , pA, #4
+	add	pB , pB, #4
+
+.endm
+
+.macro SAVE1x1
+	// Single C element at pCRow0 += alpha * s16; pCRow0 advances by 4 bytes.
+
+	mov	v0.d[0], tempALPHA	// s0 = alpha again (s0 was used for the A operand)
+
+	L1ST (	s8,pCRow0, #0 )
+	F1ST ( 	s8 , s0 , s16)
+	str 	s8 , [pCRow0]
+
+	add	pCRow0, pCRow0, #4
+
+.endm
+
+
+
+
+
+/**************************************************************************************
+* End of macro definitions
+**************************************************************************************/
+
+	PROLOGUE
+	// Entry: x0 = M, x1 = N, x2 = K, s0 = alpha, x3 = A, x4 = B, x5 = C, x6 = ldc in elements. Returns 0 in x0.
+	.align 5
+        add     sp,sp,#-(5*16)
+        stp     d8,d9,[sp,#(0*16)]
+        stp     d10,d11,[sp,#(1*16)]
+        stp     d12,d13,[sp,#(2*16)]
+        stp     d14,d15,[sp,#(3*16)]
+        stp     d16,d17,[sp,#(4*16)]   // NOTE(review): d16/d17 are not callee-saved under AAPCS64 - confirm this save is needed
+
+        mov     tempALPHA, v0.d[0]     // keep alpha in a general register; v0 is reused for A operands
+	lsl	LDC, LDC, #2					// ldc = ldc * 4
+
+	mov	pB, origPB
+
+	mov	counterJ, origN
+	asr 	counterJ, counterJ, #2					// J = J / 4
+	cmp 	counterJ, #0
+	ble	sgemm_kernel_L2_BEGIN
+
+sgemm_kernel_L4_BEGIN:
+
+	mov	pCRow0, pC						// pCRow0 = C
+        add     pC,pC,LDC, lsl #2                                       // pC += 4 * LDC bytes for the next pass
+
+	mov	pA, origPA						// pA = start of A array
+
+
+
+sgemm_kernel_L4_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp 	counterI, #0
+	ble	sgemm_kernel_L4_M2_BEGIN
+
+sgemm_kernel_L4_M4_20:
+
+	mov	pB, origPB
+	asr 	counterL , origK, #1					// L = K / 2
+	cmp	counterL , #2                                           // is there at least 4 to do?
+	blt	sgemm_kernel_L4_M4_32
+
+
+
+	KERNEL4x4_I     //do one in the K
+	KERNEL4x4_M2    //do another in the K
+
+	subs	counterL, counterL, #2  // subtract 2, since one is always done at the tail
+	ble	sgemm_kernel_L4_M4_22a
+	.align 5
+
+sgemm_kernel_L4_M4_22:
+
+	KERNEL4x4_M1
+	KERNEL4x4_M2
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M4_22
+
+sgemm_kernel_L4_M4_22a:
+
+	KERNEL4x4_M1
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+sgemm_kernel_L4_M4_32:   // less than 4 to do in the K direction
+
+	tst	counterL, #1    // K / 2 is 0 or 1 here
+	ble	sgemm_kernel_L4_M4_40
+
+	KERNEL4x4_I
+
+	KERNEL4x4_E
+
+	b	 sgemm_kernel_L4_M4_44
+
+
+sgemm_kernel_L4_M4_40:
+
+	INIT4x4
+
+
+sgemm_kernel_L4_M4_44:
+
+	ands	counterL , origK, #1    // odd K: one more step
+	ble	sgemm_kernel_L4_M4_100
+
+sgemm_kernel_L4_M4_46:
+
+	KERNEL4x4_SUB
+
+	subs	counterL, counterL, #1
+	bne	sgemm_kernel_L4_M4_46
+
+sgemm_kernel_L4_M4_100:
+
+	SAVE4x4
+
+sgemm_kernel_L4_M4_END:
+
+	subs	counterI, counterI, #1
+	bne	sgemm_kernel_L4_M4_20
+
+
+sgemm_kernel_L4_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3                                   // M % 4 == 0: no tail in M
+	ble	sgemm_kernel_L4_END
+
+	tst	counterI, #2					// bit 1 of M set: one 2-wide tile is left
+	ble	sgemm_kernel_L4_M1_BEGIN
+
+sgemm_kernel_L4_M2_20:
+
+	INIT2x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M2_40
+
+sgemm_kernel_L4_M2_22:
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_22
+
+
+sgemm_kernel_L4_M2_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L4_M2_100
+
+sgemm_kernel_L4_M2_42:
+
+	KERNEL2x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M2_42
+
+sgemm_kernel_L4_M2_100:
+
+	SAVE2x4
+
+sgemm_kernel_L4_M2_END:
+
+
+sgemm_kernel_L4_M1_BEGIN:
+
+	tst	counterI, #1					// bit 0 of M set: one 1-wide tile is left
+	ble	sgemm_kernel_L4_END
+
+sgemm_kernel_L4_M1_20:
+
+	INIT1x4
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L4_M1_40
+
+sgemm_kernel_L4_M1_22:
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_22
+
+
+sgemm_kernel_L4_M1_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L4_M1_100
+
+sgemm_kernel_L4_M1_42:
+
+	KERNEL1x4_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L4_M1_42
+
+sgemm_kernel_L4_M1_100:
+
+	SAVE1x4
+
+
+sgemm_kernel_L4_END:
+
+	add	origPB, origPB, origK, lsl #4 // B = B + K * 4 * 4
+
+	subs	counterJ, counterJ , #1						// j--
+	bgt	sgemm_kernel_L4_BEGIN
+
+
+
+/*********************************************************************************************/
+
+sgemm_kernel_L2_BEGIN:   // N % 4 remainder: at most one 2-wide pass and one 1-wide pass
+
+	mov	counterJ , origN
+	tst	counterJ , #3
+	ble	sgemm_kernel_L999   // N is a multiple of 4: nothing left to do
+
+	tst	counterJ , #2       // bit 1 of N clear: skip the 2-wide pass
+	ble	sgemm_kernel_L1_BEGIN
+
+	mov	pCRow0, pC						// pCRow0 = pC
+	add	pC , pC, LDC, lsl #1                                    // pC += 2 * LDC bytes for the next pass
+
+	mov	pA, origPA						// pA = A
+
+
+
+sgemm_kernel_L2_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp	counterI,#0
+	ble	sgemm_kernel_L2_M2_BEGIN
+
+sgemm_kernel_L2_M4_20:
+
+	INIT4x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL,#0
+	ble	sgemm_kernel_L2_M4_40
+	.align 5
+
+sgemm_kernel_L2_M4_22:
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_22
+
+
+sgemm_kernel_L2_M4_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L2_M4_100
+
+sgemm_kernel_L2_M4_42:
+
+	KERNEL4x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M4_42
+
+sgemm_kernel_L2_M4_100:
+
+	SAVE4x2
+
+sgemm_kernel_L2_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L2_M4_20
+
+
+sgemm_kernel_L2_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3                                   // M % 4 == 0: no tail in M
+	ble	sgemm_kernel_L2_END
+
+	tst	counterI, #2					// bit 1 of M set: one 2-wide tile is left
+	ble	sgemm_kernel_L2_M1_BEGIN
+
+sgemm_kernel_L2_M2_20:
+
+	INIT2x2
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = K / 8
+        cmp	counterL,#0
+	ble	sgemm_kernel_L2_M2_40
+
+sgemm_kernel_L2_M2_22:
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_22
+
+
+sgemm_kernel_L2_M2_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L2_M2_100
+
+sgemm_kernel_L2_M2_42:
+
+	KERNEL2x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M2_42
+
+sgemm_kernel_L2_M2_100:
+
+	SAVE2x2
+
+sgemm_kernel_L2_M2_END:
+
+
+sgemm_kernel_L2_M1_BEGIN:
+
+	tst	counterI, #1					// bit 0 of M set: one 1-wide tile is left
+	ble	sgemm_kernel_L2_END
+
+sgemm_kernel_L2_M1_20:
+
+	INIT1x2
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = K / 8
+        cmp     counterL, #0
+	ble	sgemm_kernel_L2_M1_40
+
+sgemm_kernel_L2_M1_22:
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_22
+
+
+sgemm_kernel_L2_M1_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L2_M1_100
+
+sgemm_kernel_L2_M1_42:
+
+	KERNEL1x2_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L2_M1_42
+
+sgemm_kernel_L2_M1_100:
+
+	SAVE1x2
+
+
+sgemm_kernel_L2_END:
+	add	origPB, origPB, origK, lsl #3					// B = B + K * 2 * 4
+
+/*********************************************************************************************/
+
+sgemm_kernel_L1_BEGIN:
+
+	mov	counterJ , origN
+	tst	counterJ , #1
+	ble	sgemm_kernel_L999 // done
+
+
+	mov	pCRow0, pC						// pCRow0 = C
+	add	pC , pCRow0 , LDC                                 // C01 is the current line, update pC to point to next
+
+	mov	pA, origPA						// pA = A
+
+
+
+sgemm_kernel_L1_M4_BEGIN:
+
+	mov	counterI, origM
+	asr 	counterI, counterI, #2					// counterI = counterI / 4
+	cmp	counterI, #0
+	ble	sgemm_kernel_L1_M2_BEGIN
+
+sgemm_kernel_L1_M4_20:
+
+	INIT4x1
+
+	mov	pB, origPB
+	asr	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M4_40
+	.align 5
+
+sgemm_kernel_L1_M4_22:
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_22
+
+
+sgemm_kernel_L1_M4_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L1_M4_100
+
+sgemm_kernel_L1_M4_42:
+
+	KERNEL4x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M4_42
+
+sgemm_kernel_L1_M4_100:
+
+	SAVE4x1
+
+sgemm_kernel_L1_M4_END:
+
+	subs	counterI, counterI, #1
+	bgt	sgemm_kernel_L1_M4_20
+
+
+sgemm_kernel_L1_M2_BEGIN:
+
+	mov	counterI, origM
+	tst	counterI , #3                                   // M % 4 == 0: no tail in M
+	ble	sgemm_kernel_L1_END
+
+	tst	counterI, #2					// bit 1 of M set: one 2-wide tile is left
+	ble	sgemm_kernel_L1_M1_BEGIN
+
+sgemm_kernel_L1_M2_20:
+
+	INIT2x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M2_40
+
+sgemm_kernel_L1_M2_22:
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_22
+
+
+sgemm_kernel_L1_M2_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L1_M2_100
+
+sgemm_kernel_L1_M2_42:
+
+	KERNEL2x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M2_42
+
+sgemm_kernel_L1_M2_100:
+
+	SAVE2x1
+
+sgemm_kernel_L1_M2_END:
+
+
+sgemm_kernel_L1_M1_BEGIN:
+
+	tst	counterI, #1					// bit 0 of M set: one 1-wide tile is left
+	ble	sgemm_kernel_L1_END
+
+sgemm_kernel_L1_M1_20:
+
+	INIT1x1
+
+	mov	pB, origPB
+	asr 	counterL , origK, #3					// counterL = K / 8
+	cmp	counterL , #0
+	ble	sgemm_kernel_L1_M1_40
+
+sgemm_kernel_L1_M1_22:
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_22
+
+
+sgemm_kernel_L1_M1_40:
+
+	ands	counterL , origK, #7					// counterL = K % 8
+	ble	sgemm_kernel_L1_M1_100
+
+sgemm_kernel_L1_M1_42:
+
+	KERNEL1x1_SUB
+
+	subs	counterL, counterL, #1
+	bgt	sgemm_kernel_L1_M1_42
+
+sgemm_kernel_L1_M1_100:
+
+	SAVE1x1
+
+
+sgemm_kernel_L1_END:
+
+
+sgemm_kernel_L999:
+	mov	x0, #0						// set return value
+        ldp     d8,d9,[sp,#(0*16)]
+        ldp     d10,d11,[sp,#(1*16)]
+        ldp     d12,d13,[sp,#(2*16)]
+        ldp     d14,d15,[sp,#(3*16)]
+        ldp     d16,d17,[sp,#(4*16)]
+        add     sp,sp,#(5*16)
+	ret
+
+	EPILOGUE
+
diff --git a/kernel/generic/trmmkernel_4x4.c b/kernel/generic/trmmkernel_4x4.c
new file mode 100644
index 000000000..a85828cad
--- /dev/null
+++ b/kernel/generic/trmmkernel_4x4.c
@@ -0,0 +1,875 @@
+#include "common.h"
+#include <stdbool.h>
+
+int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
+{  // Tiled kernel: every C tile is overwritten with alpha * sum_k(a[k]*b[k]); C is never read. Always returns 0.
+   // bm and bn are split into groups of 4, 2 and 1; "off" (derived from "offset") selects which part of the bk range a tile sums.
+   BLASLONG i,j,k;
+   FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
+   // resN_M accumulates the value stored to CN[M]: it sums ptrba[M]*ptrbb[N] over the k loop.
+   FLOAT res0_0;
+   FLOAT res0_1;
+   FLOAT res0_2;
+   FLOAT res0_3;
+
+   FLOAT res1_0;
+   FLOAT res1_1;
+   FLOAT res1_2;
+   FLOAT res1_3;
+
+   FLOAT res2_0;
+   FLOAT res2_1;
+   FLOAT res2_2;
+   FLOAT res2_3;
+
+   FLOAT res3_0;
+   FLOAT res3_1;
+   FLOAT res3_2;
+   FLOAT res3_3;
+   // a0/a1 and b0..b3 are scratch values loaded from ptrba and ptrbb for the current k step.
+   FLOAT a0;
+   FLOAT a1;
+
+   FLOAT b0;
+   FLOAT b1;
+   FLOAT b2;
+   FLOAT b3;
+   // off: running offset into the k range; temp: k-loop trip count, later reused as the number of k steps to skip.
+   BLASLONG off, temp;
+   // The LEFT / TRANSA build macros are mirrored into booleans; only the code up to and including the 4x4 tile uses them.
+   bool left;
+   bool transposed;
+   bool backwards;
+
+#ifdef LEFT
+   left = true;
+#else
+   left = false;
+#endif
+
+#ifdef TRANSA
+   transposed = true;
+#else
+   transposed = false;
+#endif
+   // backwards is true for (LEFT && !TRANSA) or (!LEFT && TRANSA), i.e. the same condition the later #if blocks spell out.
+   backwards = left != transposed;
+   // Non-LEFT builds start from the negated offset and advance it once per column group.
+   if (!left) {
+      off = -offset;
+   }
+   // NOTE(review): for LEFT builds "off" is first assigned inside the column loops, and in the Mx2/Mx1 loops that assignment is also guarded by TRMMKERNEL - presumably always defined when this file is built; confirm.
+
+   for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        C2 = C1+ldc;
+        C3 = C2+ldc;
+
+        // LEFT: the offset restarts from "offset" for every group of 4 columns.
+        if (left) {
+            off = offset;
+        }
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x4
+	{
+		// backwards: skip the first "off" k steps of both panels, then sum the remaining bk-off steps.
+		ptrbb = bb;
+                if (backwards)
+                {
+		   ptrba += off*4; // number of values in A
+		   ptrbb += off*4; // number of values in B
+                }
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+		res2_2 = 0;
+		res2_3 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+		res3_2 = 0;
+		res3_3 = 0;
+                // Trip count: bk-off when backwards, otherwise off+4 (both arms of the inner ternary are equal for a 4x4 tile).
+                temp = backwards ? bk-off :
+                             left ? off + 4 : // number of values in A
+                                    off + 4;  // number of values in B
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+			res2_2 += a0*b2;
+			res3_2 += a0*b3;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+			res2_3 += a1*b2;
+			res3_3 += a1*b3;
+			// one k step consumes 4 values from each panel
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+4;
+                }
+		// scale by alpha, then store; the previous contents of C are not used
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+		res2_2 *= alpha;
+		res2_3 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+		res3_2 *= alpha;
+		res3_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+		C2[2] = res2_2;
+		C2[3] = res2_3;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+		C3[2] = res3_2;
+		C3[3] = res3_3;
+		// !backwards: step over the bk-off-4 k steps that were not summed, so ptrba has moved bk*4 values in total for this tile.
+		if (!backwards) {
+                    temp = bk-off;
+                    temp = left ? temp - 4 : // number of values in A
+                                  temp - 4;  // number of values in B
+
+                    ptrba += temp*4; // number of values in A
+		    ptrbb += temp*4; // number of values in B
+                }
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+		C2 = C2+4;
+		C3 = C3+4;
+
+	}
+	// The remaining tiles follow the same scheme but test the LEFT/TRANSA macros directly instead of the booleans.
+	if ( bm & 2 ) // do any 2x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+		res2_0 = 0;
+		res2_1 = 0;
+
+		res3_0 = 0;
+		res3_1 = 0;
+
+		// k extent: bk-off in the "backwards" configurations, otherwise off plus the tile's A count (LEFT) or B count (non-LEFT)
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+			res2_1 += a1*b2;
+			res3_1 += a1*b3;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		res2_0 *= alpha;
+		res2_1 *= alpha;
+
+		res3_0 *= alpha;
+		res3_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+		C2[0] = res2_0;
+		C2[1] = res2_1;
+
+		C3[0] = res3_0;
+		C3[1] = res3_1;
+
+		// forward configurations: skip the k steps that were left out of the sum
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+		C2 = C2+2;
+		C3 = C3+2;
+
+	}
+	// last single row of this 4-column group, taken when bit 0 of bm is set
+	if ( bm & 1 ) // do any 1x4 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*4;
+#endif
+
+		res0_0 = 0;
+		res1_0 = 0;
+		res2_0 = 0;
+		res3_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+4;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+			b2 = ptrbb[2];
+			b3 = ptrbb[3];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+			res2_0 += a0*b2;
+			res3_0 += a0*b3;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+4;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		res2_0 *= alpha;
+
+		res3_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+		C2[0] = res2_0;
+
+		C3[0] = res3_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 4; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*4;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+		C2 = C2+1;
+		C3 = C3+1;
+
+	}
+
+	// non-LEFT: move the offset on by the 4 columns just finished (compiled in only when TRMMKERNEL is defined)
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 4;
+#endif
+        // next group of 4 columns: bb advances by bk*4 values, C by 4*ldc elements
+        k = (bk<<2);
+        bb = bb+k;
+        i = (ldc<<2);
+        C = C+i;
+    }
+   // 2-column group, selected by bit 1 of bn
+   for (j=0; j<(bn&2); j+=2) // do the Mx2 loops 
+   {
+        C0 = C;
+        C1 = C0+ldc;
+        // (bn&2) is either 0 or 2, so this loop body runs at most once
+#if defined(TRMMKERNEL) && defined(LEFT)
+		off = offset;
+#endif
+
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x2
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+		res1_2 = 0;
+		res1_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+			res1_2 += a0*b1;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+			res1_3 += a1*b1;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+		res1_2 *= alpha;
+		res1_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+		C1[2] = res1_2;
+		C1[3] = res1_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+		C1 = C1+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+		res1_0 = 0;
+		res1_1 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+			res1_1 += a1*b1;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		res1_0 *= alpha;
+		res1_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+		C1[0] = res1_0;
+		C1[1] = res1_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+		C1 = C1+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x2 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*2;
+#endif
+
+		res0_0 = 0;
+
+		res1_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+2;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+			b1 = ptrbb[1];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+			res1_0 += a0*b1;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+2;
+                }
+
+		res0_0 *= alpha;
+
+		res1_0 *= alpha;
+
+		C0[0] = res0_0;
+
+		C1[0] = res1_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 2; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*2;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+		C1 = C1+1;
+
+	}
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 2;
+#endif
+        // step past the 2 columns: bb advances by bk*2 values, C by 2*ldc elements
+        k = (bk<<1);
+        bb = bb+k;
+        i = (ldc<<1);
+        C = C+i;
+    }
+
+
+
+
+
+
+   // Final single column, selected by bit 0 of bn; the loop body runs at most once.
+   for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
+   {
+        C0 = C;
+
+#if defined(TRMMKERNEL) &&  defined(LEFT)
+	off = offset;
+#endif
+
+        ptrba = ba;
+
+        for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*4;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+		res0_2 = 0;
+		res0_3 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+4;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			a0 = ptrba[2];
+			res0_2 += a0*b0;
+
+			a1 = ptrba[3];
+			res0_3 += a1*b0;
+
+			ptrba = ptrba+4;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+		res0_2 *= alpha;
+		res0_3 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+		C0[2] = res0_2;
+		C0[3] = res0_3;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 4; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*4;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 4; // number of values in A
+#endif
+
+		C0 = C0+4;
+
+	}
+
+	if ( bm & 2 ) // do any 2x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*2;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+		res0_1 = 0;
+
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+2;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			a1 = ptrba[1];
+			res0_1 += a1*b0;
+
+			ptrba = ptrba+2;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+		res0_1 *= alpha;
+
+		C0[0] = res0_0;
+		C0[1] = res0_1;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 2; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*2;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 2; // number of values in A
+#endif
+
+		C0 = C0+2;
+
+	}
+
+	if ( bm & 1 ) // do any 1x1 loop
+	{
+
+#if (defined(LEFT) &&  defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		ptrbb = bb;
+#else
+		ptrba += off*1;
+		ptrbb = bb + off*1;
+#endif
+
+		res0_0 = 0;
+
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+		temp = bk-off;
+#elif defined(LEFT)
+		temp = off+1;	// number of values in A
+#else
+		temp = off+1;	// number of values in B
+#endif
+
+		for (k=0; k<temp; k++)
+                {
+			b0 = ptrbb[0];
+
+			a0 = ptrba[0];
+			res0_0 += a0*b0;
+
+			ptrba = ptrba+1;
+			ptrbb = ptrbb+1;
+                }
+
+		res0_0 *= alpha;
+
+		C0[0] = res0_0;
+
+
+#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+		temp = bk - off;
+#ifdef LEFT
+		temp -= 1; // number of values in A
+#else
+		temp -= 1; // number of values in B
+#endif
+		ptrba += temp*1;
+		ptrbb += temp*1;
+#endif
+
+#ifdef LEFT
+		off += 1; // number of values in A
+#endif
+
+		C0 = C0+1;
+
+	}
+
+
+
+#if defined(TRMMKERNEL) && !defined(LEFT)
+		off += 1;
+#endif
+        // single column: bb advances by bk values, C by one ldc stride
+        k = (bk<<0);
+        bb = bb+k;
+        C = C+ldc;
+   }
+   return 0;
+}
diff --git a/param.h b/param.h
index 3e20f5882..d7a427b65 100644
--- a/param.h
+++ b/param.h
@@ -2039,8 +2039,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define GEMM_DEFAULT_OFFSET_B 0
 #define GEMM_DEFAULT_ALIGN 0x03fffUL
 
-#define SGEMM_DEFAULT_UNROLL_M  2
-#define SGEMM_DEFAULT_UNROLL_N  2
+#define SGEMM_DEFAULT_UNROLL_M  4
+#define SGEMM_DEFAULT_UNROLL_N  4
 
 #define DGEMM_DEFAULT_UNROLL_M  2
 #define DGEMM_DEFAULT_UNROLL_N  2