From 0caf1434c928d39373499ffc02abe645945485d8 Mon Sep 17 00:00:00 2001 From: "Wang, Long" Date: Wed, 20 Nov 2019 11:50:37 +0800 Subject: [PATCH 1/2] Fix the integer overflow issue for large matrix sizes For large matrices, e.g. M=N=K with M>1290, int mnk=M*N*K will overflow. This leads to incorrectly branching to the single-threaded path, and performance degrades significantly. Signed-off-by: Wang, Long --- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 2 +- kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 3246e681f..31d82e3bf 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -1215,7 +1215,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, flo int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 5d491237b..95963c0ac 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - int mnk = M * N * K; + unsigned long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0; From 1191db1a49b237e7c636616cb51ca0879d01c128 Mon Sep 17 00:00:00 2001 From: "Wang, Long" Date: Wed, 20 Nov 2019 21:30:16 +0800 Subject: [PATCH 2/2] For the sake of Windows compatibility, use "unsigned long long" to ensure 64-bit length Signed-off-by: Wang, Long --- kernel/x86_64/sgemm_kernel_16x4_skylakex.c | 4 ++-- 
kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c index 31d82e3bf..4177ae2dc 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex.c @@ -762,7 +762,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. int __attribute__ ((noinline)) CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict A, float * __restrict B, float * __restrict C, BLASLONG ldc) { - unsigned long M = m, N = n, K = k; + unsigned long long M = m, N = n, K = k; if (M == 0) return 0; if (N == 0) @@ -1639,4 +1639,4 @@ void sgemm_kernel_direct (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict STORE_SCALAR(0, 0); } } -} \ No newline at end of file +} diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c index 95963c0ac..ee3417505 100644 --- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c +++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_2.c @@ -452,7 +452,7 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f int sgemm_kernel_direct_performant(BLASLONG M, BLASLONG N, BLASLONG K) { - unsigned long mnk = M * N * K; + unsigned long long mnk = M * N * K; /* large matrixes -> not performant */ if (mnk >= 28 * 512 * 512) return 0;