Optimize general Gemm Beta

This commit is contained in:
Qiyu8 2020-01-20 11:49:42 +08:00
parent 093d37de8d
commit ff42e68652
1 changed files with 42 additions and 90 deletions

View File

@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5, FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc){ FLOAT *c, BLASLONG ldc){
BLASLONG i, j; BLASLONG i, j;
BLASLONG chunk, remain;
FLOAT *c_offset1, *c_offset; FLOAT *c_offset1, *c_offset;
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
c_offset = c; c_offset = c;
chunk = m >> 3;
remain = m & 7;
if (beta == ZERO){ if (beta == ZERO){
for(j=n; j>0; j--){
j = n; c_offset1 = c_offset;
do { c_offset += ldc;
c_offset1 = c_offset; for(i=chunk; i>0; i--){
c_offset += ldc; *(c_offset1 + 0) = ZERO;
*(c_offset1 + 1) = ZERO;
i = (m >> 3); *(c_offset1 + 2) = ZERO;
if (i > 0){ *(c_offset1 + 3) = ZERO;
do { *(c_offset1 + 4) = ZERO;
*(c_offset1 + 0) = ZERO; *(c_offset1 + 5) = ZERO;
*(c_offset1 + 1) = ZERO; *(c_offset1 + 6) = ZERO;
*(c_offset1 + 2) = ZERO; *(c_offset1 + 7) = ZERO;
*(c_offset1 + 3) = ZERO; c_offset1 += 8;
*(c_offset1 + 4) = ZERO; }
*(c_offset1 + 5) = ZERO; for(i=remain; i>0; i--){
*(c_offset1 + 6) = ZERO; *c_offset1 = ZERO;
*(c_offset1 + 7) = ZERO; c_offset1 ++;
c_offset1 += 8; }
i --; }
} while (i > 0);
}
i = (m & 7);
if (i > 0){
do {
*c_offset1 = ZERO;
c_offset1 ++;
i --;
} while (i > 0);
}
j --;
} while (j > 0);
} else { } else {
for(j=n; j>0; j--){
j = n; c_offset1 = c_offset;
do { c_offset += ldc;
c_offset1 = c_offset; for(i=chunk; i>0; i--){
c_offset += ldc; *(c_offset1 + 0) *= beta;
*(c_offset1 + 1) *= beta;
i = (m >> 3); *(c_offset1 + 2) *= beta;
if (i > 0){ *(c_offset1 + 3) *= beta;
do { *(c_offset1 + 4) *= beta;
ctemp1 = *(c_offset1 + 0); *(c_offset1 + 5) *= beta;
ctemp2 = *(c_offset1 + 1); *(c_offset1 + 6) *= beta;
ctemp3 = *(c_offset1 + 2); *(c_offset1 + 7) *= beta;
ctemp4 = *(c_offset1 + 3); c_offset1 += 8;
ctemp5 = *(c_offset1 + 4); }
ctemp6 = *(c_offset1 + 5); for(i=remain; i>0; i--){
ctemp7 = *(c_offset1 + 6); *c_offset1 *= beta;
ctemp8 = *(c_offset1 + 7); c_offset1 ++;
}
ctemp1 *= beta; }
ctemp2 *= beta;
ctemp3 *= beta;
ctemp4 *= beta;
ctemp5 *= beta;
ctemp6 *= beta;
ctemp7 *= beta;
ctemp8 *= beta;
*(c_offset1 + 0) = ctemp1;
*(c_offset1 + 1) = ctemp2;
*(c_offset1 + 2) = ctemp3;
*(c_offset1 + 3) = ctemp4;
*(c_offset1 + 4) = ctemp5;
*(c_offset1 + 5) = ctemp6;
*(c_offset1 + 6) = ctemp7;
*(c_offset1 + 7) = ctemp8;
c_offset1 += 8;
i --;
} while (i > 0);
}
i = (m & 7);
if (i > 0){
do {
ctemp1 = *c_offset1;
ctemp1 *= beta;
*c_offset1 = ctemp1;
c_offset1 ++;
i --;
} while (i > 0);
}
j --;
} while (j > 0);
} }
return 0; return 0;
}; };