From ff42e68652fbba58936c9c66d0b060c3a6d694e7 Mon Sep 17 00:00:00 2001 From: Qiyu8 Date: Mon, 20 Jan 2020 11:49:42 +0800 Subject: [PATCH] Optimize genenal Gemm Beta --- kernel/generic/gemm_beta.c | 132 ++++++++++++------------------------- 1 file changed, 42 insertions(+), 90 deletions(-) diff --git a/kernel/generic/gemm_beta.c b/kernel/generic/gemm_beta.c index c4e4f7abe..fa9d7680d 100644 --- a/kernel/generic/gemm_beta.c +++ b/kernel/generic/gemm_beta.c @@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta, FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, FLOAT *c, BLASLONG ldc){ + BLASLONG i, j; + BLASLONG chunk, remain; FLOAT *c_offset1, *c_offset; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - c_offset = c; - + chunk = m >> 3; + remain = m & 7; if (beta == ZERO){ - - j = n; - do { - c_offset1 = c_offset; - c_offset += ldc; - - i = (m >> 3); - if (i > 0){ - do { - *(c_offset1 + 0) = ZERO; - *(c_offset1 + 1) = ZERO; - *(c_offset1 + 2) = ZERO; - *(c_offset1 + 3) = ZERO; - *(c_offset1 + 4) = ZERO; - *(c_offset1 + 5) = ZERO; - *(c_offset1 + 6) = ZERO; - *(c_offset1 + 7) = ZERO; - c_offset1 += 8; - i --; - } while (i > 0); - } - - i = (m & 7); - if (i > 0){ - do { - *c_offset1 = ZERO; - c_offset1 ++; - i --; - } while (i > 0); - } - j --; - } while (j > 0); - + for(j=n; j>0; j--){ + c_offset1 = c_offset; + c_offset += ldc; + for(i=chunk; i>0; i--){ + *(c_offset1 + 0) = ZERO; + *(c_offset1 + 1) = ZERO; + *(c_offset1 + 2) = ZERO; + *(c_offset1 + 3) = ZERO; + *(c_offset1 + 4) = ZERO; + *(c_offset1 + 5) = ZERO; + *(c_offset1 + 6) = ZERO; + *(c_offset1 + 7) = ZERO; + c_offset1 += 8; + } + for(i=remain; i>0; i--){ + *c_offset1 = ZERO; + c_offset1 ++; + } + } } else { - - j = n; - do { - c_offset1 = c_offset; - c_offset += ldc; - - i = (m >> 3); - if (i > 0){ - do { - ctemp1 = *(c_offset1 + 0); - ctemp2 = *(c_offset1 + 1); - ctemp3 = *(c_offset1 + 2); - ctemp4 = *(c_offset1 + 3); - ctemp5 = *(c_offset1 + 4); - ctemp6 = *(c_offset1 + 5); - ctemp7 = *(c_offset1 + 6); - ctemp8 = *(c_offset1 + 7); - - ctemp1 *= beta; - ctemp2 *= beta; - ctemp3 *= beta; - ctemp4 *= beta; - ctemp5 *= beta; - ctemp6 *= beta; - ctemp7 *= beta; - ctemp8 *= beta; - - *(c_offset1 + 0) = ctemp1; - *(c_offset1 + 1) = ctemp2; - *(c_offset1 + 2) = ctemp3; - *(c_offset1 + 3) = ctemp4; - *(c_offset1 + 4) = ctemp5; - *(c_offset1 + 5) = ctemp6; - *(c_offset1 + 6) = ctemp7; - *(c_offset1 + 7) = ctemp8; - c_offset1 += 8; - i --; - } while (i > 0); - } - - i = (m & 7); - if (i > 0){ - do { - ctemp1 = *c_offset1; - ctemp1 *= beta; - *c_offset1 = ctemp1; - c_offset1 ++; - i --; - } while (i > 0); - } - j --; - } while (j > 0); - + for(j=n; j>0; j--){ + c_offset1 = c_offset; + c_offset += ldc; + for(i=chunk; i>0; i--){ + *(c_offset1 + 0) *= beta; + *(c_offset1 + 1) *= beta; + *(c_offset1 + 2) *= beta; + *(c_offset1 + 3) *= beta; + *(c_offset1 + 4) *= beta; + *(c_offset1 + 5) *= beta; + *(c_offset1 + 6) *= beta; + *(c_offset1 + 7) *= beta; + c_offset1 += 8; + } + for(i=remain; i>0; i--){ + *c_offset1 *= beta; + c_offset1 ++; + } + } } return 0; };