Merge pull request #887 from ksraste/develop
STRSM optimization for MIPS P5600 and I6400 using MSA
This commit is contained in:
@@ -160,3 +160,4 @@ In chronological order:
|
|||||||
|
|
||||||
* Kaustubh Raste <https://github.com/ksraste/>
|
* Kaustubh Raste <https://github.com/ksraste/>
|
||||||
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
||||||
|
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
|
||||||
|
|||||||
@@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
|||||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
|
||||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
|
||||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
|
||||||
|
|
||||||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
|
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
|
||||||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
|
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
|
||||||
|
|||||||
@@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
|
|
||||||
for (j = (n >> 2); j--;)
|
for (j = (n >> 2); j--;)
|
||||||
{
|
{
|
||||||
kk = m;
|
kk = m + offset;
|
||||||
|
|
||||||
if (m & 7)
|
if (m & 7)
|
||||||
{
|
{
|
||||||
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
{
|
{
|
||||||
if (n & 2)
|
if (n & 2)
|
||||||
{
|
{
|
||||||
kk = m;
|
kk = m + offset;
|
||||||
|
|
||||||
if (m & 7)
|
if (m & 7)
|
||||||
{
|
{
|
||||||
@@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
|
|
||||||
if (n & 1)
|
if (n & 1)
|
||||||
{
|
{
|
||||||
kk = m;
|
kk = m + offset;
|
||||||
|
|
||||||
if (m & 7)
|
if (m & 7)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
|
|
||||||
for (j = (n >> 2); j--;)
|
for (j = (n >> 2); j--;)
|
||||||
{
|
{
|
||||||
kk = 0;
|
kk = offset;
|
||||||
aa = a;
|
aa = a;
|
||||||
cc = c;
|
cc = c;
|
||||||
|
|
||||||
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
{
|
{
|
||||||
if (n & 2)
|
if (n & 2)
|
||||||
{
|
{
|
||||||
kk = 0;
|
kk = offset;
|
||||||
aa = a;
|
aa = a;
|
||||||
cc = c;
|
cc = c;
|
||||||
|
|
||||||
@@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
|
|
||||||
if (n & 1)
|
if (n & 1)
|
||||||
{
|
{
|
||||||
kk = 0;
|
kk = offset;
|
||||||
aa = a;
|
aa = a;
|
||||||
cc = c;
|
cc = c;
|
||||||
|
|
||||||
|
|||||||
@@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
BLASLONG i, j, kk;
|
BLASLONG i, j, kk;
|
||||||
FLOAT *aa, *cc;
|
FLOAT *aa, *cc;
|
||||||
|
|
||||||
kk = 0;
|
kk = -offset;
|
||||||
|
|
||||||
for (j = (n >> 2); j--;)
|
for (j = (n >> 2); j--;)
|
||||||
{
|
{
|
||||||
|
|||||||
@@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||||||
BLASLONG i, j, kk;
|
BLASLONG i, j, kk;
|
||||||
FLOAT *aa, *cc, *bb;
|
FLOAT *aa, *cc, *bb;
|
||||||
|
|
||||||
kk = n;
|
kk = n - offset;
|
||||||
c += n * ldc;
|
c += n * ldc;
|
||||||
b += n * k;
|
b += n * k;
|
||||||
|
|
||||||
|
|||||||
@@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||||||
}
|
}
|
||||||
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
|
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
|
||||||
|
|
||||||
|
/* Description : Indexed word element values are replicated to all
|
||||||
|
elements in output vector
|
||||||
|
Arguments : Inputs - in, stidx
|
||||||
|
Outputs - out0, out1
|
||||||
|
Return Type - as per RTYPE
|
||||||
|
Details : 'stidx' element value from 'in' vector is replicated to all
|
||||||
|
elements in 'out0' vector
|
||||||
|
'stidx + 1' element value from 'in' vector is replicated to all
|
||||||
|
elements in 'out1' vector
|
||||||
|
Valid index range for word operation is 0-3
|
||||||
|
*/
|
||||||
|
#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
|
||||||
|
{ \
|
||||||
|
out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
|
||||||
|
out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
|
||||||
|
}
|
||||||
|
|
||||||
|
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
|
||||||
|
{ \
|
||||||
|
SPLATI_W2(RTYPE, in, 0, out0, out1); \
|
||||||
|
SPLATI_W2(RTYPE, in, 2, out2, out3); \
|
||||||
|
}
|
||||||
|
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
|
||||||
|
|
||||||
/* Description : Transpose 4x4 block with word elements in vectors
|
/* Description : Transpose 4x4 block with word elements in vectors
|
||||||
Arguments : Inputs - in0, in1, in2, in3
|
Arguments : Inputs - in0, in1, in2, in3
|
||||||
Outputs - out0, out1, out2, out3
|
Outputs - out0, out1, out2, out3
|
||||||
|
|||||||
2133
kernel/mips/strsm_kernel_LN_8x8_msa.c
Normal file
2133
kernel/mips/strsm_kernel_LN_8x8_msa.c
Normal file
File diff suppressed because it is too large
Load Diff
2099
kernel/mips/strsm_kernel_LT_8x8_msa.c
Normal file
2099
kernel/mips/strsm_kernel_LT_8x8_msa.c
Normal file
File diff suppressed because it is too large
Load Diff
2162
kernel/mips/strsm_kernel_RN_8x8_msa.c
Normal file
2162
kernel/mips/strsm_kernel_RN_8x8_msa.c
Normal file
File diff suppressed because it is too large
Load Diff
2118
kernel/mips/strsm_kernel_RT_8x8_msa.c
Normal file
2118
kernel/mips/strsm_kernel_RT_8x8_msa.c
Normal file
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user