From dcc5d6291e7b02761acfb6161c04ba1f8f25b502 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Thu, 1 Nov 2018 01:42:09 +0000 Subject: [PATCH] skylakex: Make the sgemm/dgemm beta code robust for a N=0 or M=0 case in the threading code there are cases where N or M can become 0, and the optimized beta code did not handle this well, leading to a crash during the audit for the crash a few edge conditions on the if statements were found and fixed as well --- kernel/x86_64/dgemm_beta_skylakex.c | 6 ++++-- kernel/x86_64/sgemm_beta_skylakex.c | 6 ++++-- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/x86_64/dgemm_beta_skylakex.c b/kernel/x86_64/dgemm_beta_skylakex.c index 384e9f60b..6a824c9b5 100644 --- a/kernel/x86_64/dgemm_beta_skylakex.c +++ b/kernel/x86_64/dgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (m == 0 || n == 0) + return 0; c_offset = c; @@ -69,7 +71,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_pd(c_offset1, z_zero); _mm512_storeu_pd(c_offset1 + 8, z_zero); _mm512_storeu_pd(c_offset1 + 16, z_zero); @@ -77,7 +79,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm512_storeu_pd(c_offset1, z_zero); c_offset1 += 8; i -= 8; diff --git a/kernel/x86_64/sgemm_beta_skylakex.c b/kernel/x86_64/sgemm_beta_skylakex.c index 54f9664e9..4e40acadf 100644 --- a/kernel/x86_64/sgemm_beta_skylakex.c +++ b/kernel/x86_64/sgemm_beta_skylakex.c @@ -55,6 +55,8 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, return 0; } + if (n == 0 || m == 0) + return; c_offset = c; @@ -71,13 +73,13 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, i = m; - while (i > 32) { + while (i >= 32) { _mm512_storeu_ps(c_offset1, z_zero); _mm512_storeu_ps(c_offset1 + 16, z_zero); c_offset1 += 32; i -= 32; } - while (i > 8) { + while (i >= 8) { _mm256_storeu_ps(c_offset1, y_zero); c_offset1 += 8; i -= 8;