diff --git a/kernel/x86_64/sgemm_direct_performant.c b/kernel/x86_64/sgemm_direct_performant.c
new file mode 100644
index 000000000..5a20ce395
--- /dev/null
+++ b/kernel/x86_64/sgemm_direct_performant.c
@@ -0,0 +1,30 @@
+#include "common.h"
+/* helper for the direct sgemm code written by Arjan van der Ven */
+
+/* Returns 1 when the direct sgemm kernel is considered performant for the
+ * given M, N, K sizes, and 0 otherwise.  All thresholds are compared against
+ * the product M*N*K. */
+int CNAME(BLASLONG M, BLASLONG N, BLASLONG K)
+{
+	/* widen first: the product must not be evaluated in signed BLASLONG */
+	unsigned long long mnk = (unsigned long long)M * N * K;
+	/* large matrices -> not performant */
+	if (mnk >= 28 * 512 * 512)
+		return 0;
+	/*
+	 * if the B matrix is not a nice multiple of 4 we get many unaligned accesses,
+	 * and the regular sgemm copy/realignment of data pays off much quicker
+	 */
+	if ((N & 3) != 0 && (mnk >= 8 * 512 * 512))
+		return 0;
+
+#ifdef SMP
+	/* if we can run multithreaded, the threading changes the base threshold */
+	if (mnk > 2 * 350 * 512 && num_cpu_avail(3) > 1)
+		return 0;
+#endif
+
+	return 1;
+}
+
+
diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
index 0e8f1318f..a7cddbb3d 100644
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,7 +1,7 @@
-
+#if defined(SKYLAKEX) || defined (COOPERLAKE)
 /* the direct sgemm code written by Arjan van der Ven */
-//#include <immintrin.h>
-
+#include <immintrin.h>
+#include "common.h"
 /*
  * "Direct sgemm" code. This code operates directly on the inputs and outputs
  * of the sgemm call, avoiding the copies, memory realignments and threading,
@@ -38,6 +38,7 @@
 #define MATMUL_SCALAR(N,M) result##N##M +=  Aval##M * Bval##N;
 #define STORE_SCALAR(N,M)  R[(i+M) * strideR + j + N] = result##N##M;
 
+#if 0
 int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 {
 	unsigned long long mnk = M * N * K;
@@ -61,9 +62,10 @@ int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K)
 	return 1;
 }
 
+#endif
 
-
-void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+//void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
 {
 	int i, j, k;
 
@@ -465,3 +467,8 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict
 		}
 	}
 }
+#else
+#include "common.h"
+void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR)
+{} /* empty fallback in the #else branch of the SKYLAKEX/COOPERLAKE check: does nothing */
+#endif