Optimize general Gemm Beta

This commit is contained in:
Qiyu8 2020-01-20 11:49:42 +08:00
parent 093d37de8d
commit ff42e68652
1 changed files with 42 additions and 90 deletions

View File

/*
 * NOTE(review): this span is a flattened diff hunk, not a function that
 * should be compiled as-is. The hunk header below is fused with the first
 * line of the signature, and removed and added lines appear together with
 * their markers stripped. The header gives 101 lines on the old side and
 * 53 on the new side, so the do/while loops and the ctemp1..ctemp8
 * temporaries look like the removed implementation, while the for loops
 * driven by chunk/remain look like the replacement -- confirm against the
 * repository before treating any line as current code.
 *
 * CNAME: overwrites n runs of m contiguous FLOAT elements starting at c,
 * consecutive runs being ldc elements apart. When beta == ZERO every
 * element is set to ZERO without being read; otherwise every element is
 * multiplied by beta in place. dummy1..dummy5 are not referenced by any
 * line shown here. Every path shown ends at the single "return 0".
 */
@ -42,101 +42,53 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT beta,
FLOAT *dummy2, BLASLONG dummy3, FLOAT *dummy4, BLASLONG dummy5,
FLOAT *c, BLASLONG ldc){
BLASLONG i, j;
/* chunk: full groups of 8 elements per run; remain: leftover elements. */
BLASLONG chunk, remain;
/* c_offset steps from run to run; c_offset1 steps inside the current run. */
FLOAT *c_offset1, *c_offset;
/* Temporaries referenced only by the do/while form of the scaling path. */
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
c_offset = c;
/* For non-negative m these are m / 8 and m % 8. */
chunk = m >> 3;
remain = m & 7;
/* beta == ZERO: store ZERO outright; existing contents are never read. */
if (beta == ZERO){
/*
 * do/while form (presumably the removed lines). The outer do/while runs
 * its body once even when n <= 0; the inner loops are guarded by i > 0.
 */
j = n;
do {
c_offset1 = c_offset;
c_offset += ldc;
i = (m >> 3);
if (i > 0){
do {
*(c_offset1 + 0) = ZERO;
*(c_offset1 + 1) = ZERO;
*(c_offset1 + 2) = ZERO;
*(c_offset1 + 3) = ZERO;
*(c_offset1 + 4) = ZERO;
*(c_offset1 + 5) = ZERO;
*(c_offset1 + 6) = ZERO;
*(c_offset1 + 7) = ZERO;
c_offset1 += 8;
i --;
} while (i > 0);
}
i = (m & 7);
if (i > 0){
do {
*c_offset1 = ZERO;
c_offset1 ++;
i --;
} while (i > 0);
}
j --;
} while (j > 0);
/*
 * for form (presumably the added lines): same stores, 8 per iteration for
 * the full groups and one per iteration for the tail. Unlike the do/while
 * above, nothing executes when n <= 0.
 */
for(j=n; j>0; j--){
c_offset1 = c_offset;
c_offset += ldc;
for(i=chunk; i>0; i--){
*(c_offset1 + 0) = ZERO;
*(c_offset1 + 1) = ZERO;
*(c_offset1 + 2) = ZERO;
*(c_offset1 + 3) = ZERO;
*(c_offset1 + 4) = ZERO;
*(c_offset1 + 5) = ZERO;
*(c_offset1 + 6) = ZERO;
*(c_offset1 + 7) = ZERO;
c_offset1 += 8;
}
for(i=remain; i>0; i--){
*c_offset1 = ZERO;
c_offset1 ++;
}
}
/* beta != ZERO: multiply every element by beta in place. */
} else {
/*
 * do/while form (presumably the removed lines): load 8 values into the
 * ctemp temporaries, scale them, then store them back.
 */
j = n;
do {
c_offset1 = c_offset;
c_offset += ldc;
i = (m >> 3);
if (i > 0){
do {
ctemp1 = *(c_offset1 + 0);
ctemp2 = *(c_offset1 + 1);
ctemp3 = *(c_offset1 + 2);
ctemp4 = *(c_offset1 + 3);
ctemp5 = *(c_offset1 + 4);
ctemp6 = *(c_offset1 + 5);
ctemp7 = *(c_offset1 + 6);
ctemp8 = *(c_offset1 + 7);
ctemp1 *= beta;
ctemp2 *= beta;
ctemp3 *= beta;
ctemp4 *= beta;
ctemp5 *= beta;
ctemp6 *= beta;
ctemp7 *= beta;
ctemp8 *= beta;
*(c_offset1 + 0) = ctemp1;
*(c_offset1 + 1) = ctemp2;
*(c_offset1 + 2) = ctemp3;
*(c_offset1 + 3) = ctemp4;
*(c_offset1 + 4) = ctemp5;
*(c_offset1 + 5) = ctemp6;
*(c_offset1 + 6) = ctemp7;
*(c_offset1 + 7) = ctemp8;
c_offset1 += 8;
i --;
} while (i > 0);
}
i = (m & 7);
if (i > 0){
do {
ctemp1 = *c_offset1;
ctemp1 *= beta;
*c_offset1 = ctemp1;
c_offset1 ++;
i --;
} while (i > 0);
}
j --;
} while (j > 0);
/*
 * for form (presumably the added lines): scales through the pointer with
 * a compound assignment, so no temporaries are needed.
 */
for(j=n; j>0; j--){
c_offset1 = c_offset;
c_offset += ldc;
for(i=chunk; i>0; i--){
*(c_offset1 + 0) *= beta;
*(c_offset1 + 1) *= beta;
*(c_offset1 + 2) *= beta;
*(c_offset1 + 3) *= beta;
*(c_offset1 + 4) *= beta;
*(c_offset1 + 5) *= beta;
*(c_offset1 + 6) *= beta;
*(c_offset1 + 7) *= beta;
c_offset1 += 8;
}
for(i=remain; i>0; i--){
*c_offset1 *= beta;
c_offset1 ++;
}
}
}
return 0;
/* The ';' after the closing brace is a stray empty declaration. */
};