Merge pull request #887 from ksraste/develop

STRSM optimization for MIPS P5600 and I6400 using MSA
This commit is contained in:
Zhang Xianyi 2016-05-21 07:17:21 +08:00
commit b46f680f01
11 changed files with 8549 additions and 12 deletions

View File

@ -160,3 +160,4 @@ In chronological order:
* Kaustubh Raste <https://github.com/ksraste/> * Kaustubh Raste <https://github.com/ksraste/>
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA * [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA

View File

@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c

View File

@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (j = (n >> 2); j--;) for (j = (n >> 2); j--;)
{ {
kk = m; kk = m + offset;
if (m & 7) if (m & 7)
{ {
@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{ {
if (n & 2) if (n & 2)
{ {
kk = m; kk = m + offset;
if (m & 7) if (m & 7)
{ {
@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (n & 1) if (n & 1)
{ {
kk = m; kk = m + offset;
if (m & 7) if (m & 7)
{ {

View File

@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
for (j = (n >> 2); j--;) for (j = (n >> 2); j--;)
{ {
kk = 0; kk = offset;
aa = a; aa = a;
cc = c; cc = c;
@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
{ {
if (n & 2) if (n & 2)
{ {
kk = 0; kk = offset;
aa = a; aa = a;
cc = c; cc = c;
@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
if (n & 1) if (n & 1)
{ {
kk = 0; kk = offset;
aa = a; aa = a;
cc = c; cc = c;

View File

@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
BLASLONG i, j, kk; BLASLONG i, j, kk;
FLOAT *aa, *cc; FLOAT *aa, *cc;
kk = 0; kk = -offset;
for (j = (n >> 2); j--;) for (j = (n >> 2); j--;)
{ {

View File

@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
BLASLONG i, j, kk; BLASLONG i, j, kk;
FLOAT *aa, *cc, *bb; FLOAT *aa, *cc, *bb;
kk = n; kk = n - offset;
c += n * ldc; c += n * ldc;
b += n * k; b += n * k;

View File

@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
} }
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__) #define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
/* Description : Indexed word element values are replicated to all
                 elements in output vector
   Arguments   : Inputs  - in, stidx
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'stidx' element value from 'in' vector is replicated to all
                 elements in 'out0' vector
                 'stidx + 1' element value from 'in' vector is replicated to
                 all elements in 'out1' vector
                 Valid index range for word operation is 0-3, so 'stidx'
                 itself must lie in 0-2 for 'stidx + 1' to stay in range
   Notes       : Every argument is parenthesized in the expansion so that
                 expression arguments keep their grouping next to the cast
                 and the '+ 1'
                 'in' and 'stidx' are expanded twice each; do not pass
                 expressions with side effects
                 NOTE(review): 'stidx' is presumably required to be a
                 compile-time constant by __msa_splati_w - confirm against
                 the toolchain's msa.h
*/
#define SPLATI_W2(RTYPE, in, stidx, out0, out1)                    \
{                                                                  \
    (out0) = (RTYPE) __msa_splati_w((v4i32) (in), (stidx));        \
    (out1) = (RTYPE) __msa_splati_w((v4i32) (in), ((stidx) + 1));  \
}
/* Description : Each of the four word elements of a vector is replicated
                 to all elements of its own output vector
   Arguments   : Inputs  - in
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : Built from two SPLATI_W2 expansions: word elements 0 and 1
                 of 'in' go to 'out0' and 'out1', word elements 2 and 3 go
                 to 'out2' and 'out3'
                 'in' is forwarded in parentheses so that an expression
                 argument is treated as a single operand by the cast inside
                 SPLATI_W2
                 'in' is expanded several times; do not pass an expression
                 with side effects
                 SPLATI_W4_SP is the same operation with RTYPE fixed to
                 v4f32
*/
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3)  \
{                                                     \
    SPLATI_W2(RTYPE, (in), 0, out0, out1);            \
    SPLATI_W2(RTYPE, (in), 2, out2, out3);            \
}
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
/* Description : Transpose 4x4 block with word elements in vectors /* Description : Transpose 4x4 block with word elements in vectors
Arguments : Inputs - in0, in1, in2, in3 Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1, out2, out3 Outputs - out0, out1, out2, out3

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff