generic: Bug fixes in generic 4x2 and 4x4 gemm kernels

This commit is contained in:
Ashwin Sekhar T K 2017-07-02 02:00:48 +05:30
parent 8f83d3f961
commit eda9e8632a
2 changed files with 64 additions and 64 deletions

View File

@@ -154,11 +154,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha; res1_0 *= alpha;
res1_1 *= alpha; res1_1 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C1[0] = res1_0; C1[0] += res1_0;
C1[1] = res1_1; C1[1] += res1_1;
C0 = C0+2; C0 = C0+2;
C1 = C1+2; C1 = C1+2;
@@ -190,12 +190,12 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha; res1_0 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C1[0] = res1_0; C1[0] += res1_0;
C0 = C0+1; C0 += C0+1;
C1 = C1+1; C1 += C1+1;
} }
@@ -245,10 +245,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_2 *= alpha; res0_2 *= alpha;
res0_3 *= alpha; res0_3 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0[2] = res0_2; C0[2] += res0_2;
C0[3] = res0_3; C0[3] += res0_3;
C0 = C0+4; C0 = C0+4;
@@ -278,8 +278,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha; res0_0 *= alpha;
res0_1 *= alpha; res0_1 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0 = C0+2; C0 = C0+2;
@@ -306,7 +306,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
C0[0] = res0_0; C0[0] = res0_0;
C0 = C0+1; C0 += C0+1;
} }
k = (bk<<0); k = (bk<<0);

View File

@@ -152,25 +152,25 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_2 *= alpha; res3_2 *= alpha;
res3_3 *= alpha; res3_3 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0[2] = res0_2; C0[2] += res0_2;
C0[3] = res0_3; C0[3] += res0_3;
C1[0] = res1_0; C1[0] += res1_0;
C1[1] = res1_1; C1[1] += res1_1;
C1[2] = res1_2; C1[2] += res1_2;
C1[3] = res1_3; C1[3] += res1_3;
C2[0] = res2_0; C2[0] += res2_0;
C2[1] = res2_1; C2[1] += res2_1;
C2[2] = res2_2; C2[2] += res2_2;
C2[3] = res2_3; C2[3] += res2_3;
C3[0] = res3_0; C3[0] += res3_0;
C3[1] = res3_1; C3[1] += res3_1;
C3[2] = res3_2; C3[2] += res3_2;
C3[3] = res3_3; C3[3] += res3_3;
C0 = C0+4; C0 = C0+4;
C1 = C1+4; C1 = C1+4;
@@ -230,17 +230,17 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_0 *= alpha; res3_0 *= alpha;
res3_1 *= alpha; res3_1 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C1[0] = res1_0; C1[0] += res1_0;
C1[1] = res1_1; C1[1] += res1_1;
C2[0] = res2_0; C2[0] += res2_0;
C2[1] = res2_1; C2[1] += res2_1;
C3[0] = res3_0; C3[0] += res3_0;
C3[1] = res3_1; C3[1] += res3_1;
C0 = C0+2; C0 = C0+2;
C1 = C1+2; C1 = C1+2;
@@ -283,13 +283,13 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res3_0 *= alpha; res3_0 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C1[0] = res1_0; C1[0] += res1_0;
C2[0] = res2_0; C2[0] += res2_0;
C3[0] = res3_0; C3[0] += res3_0;
C0 = C0+1; C0 = C0+1;
C1 = C1+1; C1 = C1+1;
@@ -360,15 +360,15 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_2 *= alpha; res1_2 *= alpha;
res1_3 *= alpha; res1_3 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0[2] = res0_2; C0[2] += res0_2;
C0[3] = res0_3; C0[3] += res0_3;
C1[0] = res1_0; C1[0] += res1_0;
C1[1] = res1_1; C1[1] += res1_1;
C1[2] = res1_2; C1[2] += res1_2;
C1[3] = res1_3; C1[3] += res1_3;
C0 = C0+4; C0 = C0+4;
C1 = C1+4; C1 = C1+4;
@@ -408,11 +408,11 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha; res1_0 *= alpha;
res1_1 *= alpha; res1_1 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C1[0] = res1_0; C1[0] += res1_0;
C1[1] = res1_1; C1[1] += res1_1;
C0 = C0+2; C0 = C0+2;
C1 = C1+2; C1 = C1+2;
@@ -444,9 +444,9 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res1_0 *= alpha; res1_0 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C1[0] = res1_0; C1[0] += res1_0;
C0 = C0+1; C0 = C0+1;
C1 = C1+1; C1 = C1+1;
@@ -499,10 +499,10 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_2 *= alpha; res0_2 *= alpha;
res0_3 *= alpha; res0_3 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0[2] = res0_2; C0[2] += res0_2;
C0[3] = res0_3; C0[3] += res0_3;
C0 = C0+4; C0 = C0+4;
@@ -532,8 +532,8 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha; res0_0 *= alpha;
res0_1 *= alpha; res0_1 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0[1] = res0_1; C0[1] += res0_1;
C0 = C0+2; C0 = C0+2;
@@ -558,7 +558,7 @@ int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FL
res0_0 *= alpha; res0_0 *= alpha;
C0[0] = res0_0; C0[0] += res0_0;
C0 = C0+1; C0 = C0+1;