Merge pull request #887 from ksraste/develop
STRSM optimization for MIPS P5600 and I6400 using MSA
This commit is contained in:
@@ -160,3 +160,4 @@ In chronological order:
|
||||
|
||||
* Kaustubh Raste <https://github.com/ksraste/>
|
||||
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
|
||||
@@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
|
||||
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
|
||||
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
|
||||
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
|
||||
|
||||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
|
||||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
|
||||
|
||||
@@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
{
|
||||
if (n & 2)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
@@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
|
||||
@@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
@@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
{
|
||||
if (n & 2)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
@@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
|
||||
@@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
BLASLONG i, j, kk;
|
||||
FLOAT *aa, *cc;
|
||||
|
||||
kk = 0;
|
||||
kk = -offset;
|
||||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
|
||||
@@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
||||
BLASLONG i, j, kk;
|
||||
FLOAT *aa, *cc, *bb;
|
||||
|
||||
kk = n;
|
||||
kk = n - offset;
|
||||
c += n * ldc;
|
||||
b += n * k;
|
||||
|
||||
|
||||
@@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
}
|
||||
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Indexed word element values are replicated to all
|
||||
elements in output vector
|
||||
Arguments : Inputs - in, stidx
|
||||
Outputs - out0, out1
|
||||
Return Type - as per RTYPE
|
||||
Details : 'stidx' element value from 'in' vector is replicated to all
|
||||
elements in 'out0' vector
|
||||
'stidx + 1' element value from 'in' vector is replicated to all
|
||||
elements in 'out1' vector
|
||||
Valid index range for word operation is 0-3; because 'stidx + 1' is also used, 'stidx' itself must be in the range 0-2
|
||||
*/
|
||||
#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
|
||||
{ \
|
||||
out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
|
||||
out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
|
||||
}
|
||||
|
||||
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
|
||||
{ \
|
||||
SPLATI_W2(RTYPE, in, 0, out0, out1); \
|
||||
SPLATI_W2(RTYPE, in, 2, out2, out3); \
|
||||
}
|
||||
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
|
||||
|
||||
/* Description : Transpose 4x4 block with word elements in vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1, out2, out3
|
||||
|
||||
2133
kernel/mips/strsm_kernel_LN_8x8_msa.c
Normal file
File diff suppressed because it is too large
2099
kernel/mips/strsm_kernel_LT_8x8_msa.c
Normal file
File diff suppressed because it is too large
2162
kernel/mips/strsm_kernel_RN_8x8_msa.c
Normal file
File diff suppressed because it is too large
2118
kernel/mips/strsm_kernel_RT_8x8_msa.c
Normal file
File diff suppressed because it is too large
Reference in New Issue
Block a user