STRSM optimization for MIPS P5600 and I6400 using MSA
Signed-off-by: Kaustubh Raste <kaustubh.raste@imgtec.com>
This commit is contained in:
parent
a8fcd89d6d
commit
ad9f317870
|
@ -160,3 +160,4 @@ In chronological order:
|
|||
|
||||
* Kaustubh Raste <https://github.com/ksraste/>
|
||||
* [2016-05-09] DTRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
* [2016-05-20] STRSM optimization for MIPS P5600 and I6400 using MSA
|
||||
|
|
|
@ -113,10 +113,10 @@ ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
|||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
|
||||
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
|
||||
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
|
||||
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
|
||||
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
|
||||
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
|
||||
|
||||
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
|
||||
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
|
||||
|
|
|
@ -1170,7 +1170,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
|
@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
{
|
||||
if (n & 2)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
|
@ -1291,7 +1291,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
|
||||
if (n & 1)
|
||||
{
|
||||
kk = m;
|
||||
kk = m + offset;
|
||||
|
||||
if (m & 7)
|
||||
{
|
||||
|
|
|
@ -1182,7 +1182,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
|
@ -1233,7 +1233,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
{
|
||||
if (n & 2)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
|
@ -1282,7 +1282,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
|
||||
if (n & 1)
|
||||
{
|
||||
kk = 0;
|
||||
kk = offset;
|
||||
aa = a;
|
||||
cc = c;
|
||||
|
||||
|
|
|
@ -809,7 +809,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
BLASLONG i, j, kk;
|
||||
FLOAT *aa, *cc;
|
||||
|
||||
kk = 0;
|
||||
kk = -offset;
|
||||
|
||||
for (j = (n >> 2); j--;)
|
||||
{
|
||||
|
|
|
@ -865,7 +865,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
|
|||
BLASLONG i, j, kk;
|
||||
FLOAT *aa, *cc, *bb;
|
||||
|
||||
kk = n;
|
||||
kk = n - offset;
|
||||
c += n * ldc;
|
||||
b += n * k;
|
||||
|
||||
|
|
|
@ -137,6 +137,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
}
|
||||
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Indexed word element values are replicated to all
|
||||
elements in output vector
|
||||
Arguments : Inputs - in, stidx
|
||||
Outputs - out0, out1
|
||||
Return Type - as per RTYPE
|
||||
Details : 'stidx' element value from 'in' vector is replicated to all
|
||||
elements in 'out0' vector
|
||||
'stidx + 1' element value from 'in' vector is replicated to all
|
||||
elements in 'out1' vector
|
||||
Valid range for 'stidx' is 0-2, so that 'stidx + 1' is also a valid word index (0-3)
|
||||
*/
|
||||
#define SPLATI_W2(RTYPE, in, stidx, out0, out1) \
|
||||
{ \
|
||||
out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
|
||||
out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
|
||||
}
|
||||
|
||||
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
|
||||
{ \
|
||||
SPLATI_W2(RTYPE, in, 0, out0, out1); \
|
||||
SPLATI_W2(RTYPE, in, 2, out2, out3); \
|
||||
}
|
||||
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
|
||||
|
||||
/* Description : Transpose 4x4 block with word elements in vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1, out2, out3
|
||||
|
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue