SGEMM optimization for MIPS P5600 and I6400 using MSA. Unrolled k loop in DGEMM kernel function

Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
This commit is contained in:
Shivraj Patil 2016-05-19 11:04:42 +05:30
parent 7a19065369
commit c4ba40e308
7 changed files with 2755 additions and 21 deletions

View File

@ -85,11 +85,11 @@ DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c

View File

@ -90,7 +90,70 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 8;
pb0 += 4;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res8 += src_a0 * src_b;
res9 += src_a1 * src_b;
res10 += src_a2 * src_b;
res11 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res12 += src_a0 * src_b;
res13 += src_a1 * src_b;
res14 += src_a2 * src_b;
res15 += src_a3 * src_b;
pa0 += 8;
pb0 += 4;
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res8 += src_a0 * src_b;
res9 += src_a1 * src_b;
res10 += src_a2 * src_b;
res11 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res12 += src_a0 * src_b;
res13 += src_a1 * src_b;
res14 += src_a2 * src_b;
res15 += src_a3 * src_b;
pa0 += 8;
pb0 += 4;
}
if ((k - 1) & 1)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
LD_DP2(pb0, 2, src_b0, src_b1);
@ -185,7 +248,54 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 4;
pb0 += 4;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP2(pa0, 2, src_a0, src_a1);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
pa0 += 4;
pb0 += 4;
LD_DP2(pa0, 2, src_a0, src_a1);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res6 += src_a0 * src_b;
res7 += src_a1 * src_b;
pa0 += 4;
pb0 += 4;
}
if ((k - 1) & 1)
{
LD_DP2(pa0, 2, src_a0, src_a1);
LD_DP2(pb0, 2, src_b0, src_b1);
@ -257,7 +367,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 2;
pb0 += 4;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
src_a0 = LD_DP(pa0);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res2 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 += src_a0 * src_b;
pa0 += 2;
pb0 += 4;
src_a0 = LD_DP(pa0);
LD_DP2(pb0, 2, src_b0, src_b1);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1);
res2 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1);
res3 += src_a0 * src_b;
pa0 += 2;
pb0 += 4;
}
if ((k - 1) & 1)
{
src_a0 = LD_DP(pa0);
LD_DP2(pb0, 2, src_b0, src_b1);
@ -319,7 +468,42 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 1;
pb0 += 4;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
b1 = pb0[1];
tmp1 += a0 * b1;
b2 = pb0[2];
tmp2 += a0 * b2;
b3 = pb0[3];
tmp3 += a0 * b3;
pa0 += 1;
pb0 += 4;
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
b1 = pb0[1];
tmp1 += a0 * b1;
b2 = pb0[2];
tmp2 += a0 * b2;
b3 = pb0[3];
tmp3 += a0 * b3;
pa0 += 1;
pb0 += 4;
}
if ((k - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
@ -389,7 +573,46 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 8;
pb0 += 2;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
pa0 += 8;
pb0 += 2;
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res4 += src_a0 * src_b;
res5 += src_a1 * src_b;
res6 += src_a2 * src_b;
res7 += src_a3 * src_b;
pa0 += 8;
pb0 += 2;
}
if ((k - 1) & 1)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b0 = LD_DP(pb0);
@ -447,7 +670,38 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 4;
pb0 += 2;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP2(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
pa0 += 4;
pb0 += 2;
LD_DP2(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res2 += src_a0 * src_b;
res3 += src_a1 * src_b;
pa0 += 4;
pb0 += 2;
}
if ((k - 1) & 1)
{
LD_DP2(pa0, 2, src_a0, src_a1);
src_b0 = LD_DP(pb0);
@ -495,7 +749,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 2;
pb0 += 2;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
src_a0 = LD_DP(pa0);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
pa0 += 2;
pb0 += 2;
src_a0 = LD_DP(pa0);
src_b0 = LD_DP(pb0);
src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0);
res0 += src_a0 * src_b;
src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0);
res1 += src_a0 * src_b;
pa0 += 2;
pb0 += 2;
}
if ((k - 1) & 1)
{
src_a0 = LD_DP(pa0);
src_b0 = LD_DP(pb0);
@ -537,7 +818,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 1;
pb0 += 2;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
b1 = pb0[1];
tmp1 += a0 * b1;
pa0 += 1;
pb0 += 2;
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
b1 = pb0[1];
tmp1 += a0 * b1;
pa0 += 1;
pb0 += 2;
}
if ((k - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];
@ -587,7 +891,34 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 8;
pb0 += 1;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
pa0 += 8;
pb0 += 1;
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
res2 += src_a2 * src_b;
res3 += src_a3 * src_b;
pa0 += 8;
pb0 += 1;
}
if ((k - 1) & 1)
{
LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3);
src_b[0] = pb0[0];
@ -628,7 +959,30 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 4;
pb0 += 1;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
LD_DP2(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
pa0 += 4;
pb0 += 1;
LD_DP2(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
res1 += src_a1 * src_b;
pa0 += 4;
pb0 += 1;
}
if ((k - 1) & 1)
{
LD_DP2(pa0, 2, src_a0, src_a1);
src_b[0] = pb0[0];
@ -664,7 +1018,28 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 2;
pb0 += 1;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
pa0 += 2;
pb0 += 1;
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
src_b[1] = pb0[0];
res0 += src_a0 * src_b;
pa0 += 2;
pb0 += 1;
}
if ((k - 1) & 1)
{
src_a0 = LD_DP(pa0);
src_b[0] = pb0[0];
@ -696,7 +1071,24 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B,
pa0 += 1;
pb0 += 1;
for (l = (k - 1); l--;)
for (l = ((k - 1) / 2); l--;)
{
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
pa0 += 1;
pb0 += 1;
a0 = pa0[0];
b0 = pb0[0];
tmp0 += a0 * b0;
pa0 += 1;
pb0 += 1;
}
if ((k - 1) & 1)
{
a0 = pa0[0];
b0 = pb0[0];

View File

@ -30,12 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#include <msa.h>
/* Vector load/store helpers.
   LD_W / LD_D reinterpret 'psrc' as a pointer to RTYPE and read one RTYPE
   object through it; ST_W / ST_D reinterpret 'pdst' the same way and assign
   'in' to the object it points at.  The *_SP and *_DP wrappers fix RTYPE to
   v4f32 and v2f64 respectively.
   Every replacement list is wrapped in parentheses so the expansion acts as
   a single operand: without them LD_W(T, p)[i] would parse as
   *(((T *)(p))[i]) instead of (*((T *)(p)))[i]. */
#define LD_W(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_SP(...) LD_W(v4f32, __VA_ARGS__)
#define LD_D(RTYPE, psrc) (*((RTYPE *)(psrc)))
#define LD_DP(...) LD_D(v2f64, __VA_ARGS__)
#define ST_W(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_SP(...) ST_W(v4f32, __VA_ARGS__)
#define ST_D(RTYPE, in, pdst) (*((RTYPE *)(pdst)) = (in))
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
/* Description : Load 2 vectors of single precision floating point elements with stride
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0, out1
                 Return Type - single precision floating point
   Details     : 'out0' is loaded with LD_SP from (psrc) and 'out1' from
                 (psrc + stride).  'stride' is added to 'psrc' with ordinary
                 pointer arithmetic, so it counts elements of psrc's pointee
                 type.  'psrc' is expanded twice; do not pass an expression
                 with side effects.
*/
#define LD_SP2(psrc, stride, out0, out1)  \
{                                         \
    out0 = LD_SP((psrc));                 \
    out1 = LD_SP((psrc) + (stride));      \
}
/* Description : Load 2 vectors of double precision floating point elements with stride
Arguments : Inputs - psrc, stride
Outputs - out0, out1
@ -53,6 +70,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
LD_DP2(psrc + 2 * stride, stride, out2, out3) \
}
/* Description : Store vectors of single precision floating point elements with stride
   Arguments   : Inputs - in0, in1, ..., pdst, stride
   Details     : Store 4 single precision floating point elements from 'in0' to (pdst)
                 Store 4 single precision floating point elements from 'in1' to (pdst + stride)
                 ST_SP4 and ST_SP8 continue the same pattern: input number k
                 (counting from 0) is stored to (pdst + k * stride).
                 'pdst' and 'stride' are expanded several times and are
                 parenthesized at every use, so expression arguments are
                 safe as long as they have no side effects.
*/
#define ST_SP2(in0, in1, pdst, stride)  \
{                                       \
    ST_SP(in0, (pdst));                 \
    ST_SP(in1, (pdst) + (stride));      \
}

#define ST_SP4(in0, in1, in2, in3, pdst, stride)            \
{                                                           \
    ST_SP2(in0, in1, (pdst), (stride));                     \
    ST_SP2(in2, in3, ((pdst) + 2 * (stride)), (stride));    \
}

#define ST_SP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
{                                                                     \
    ST_SP4(in0, in1, in2, in3, (pdst), (stride));                     \
    ST_SP4(in4, in5, in6, in7, ((pdst) + 4 * (stride)), (stride));    \
}
/* Description : Store vectors of double precision floating point elements with stride
Arguments : Inputs - in0, in1, pdst, stride
Details : Store 2 double precision floating point elements from 'in0' to (pdst)
@ -83,6 +123,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
Details : Right half of byte elements from 'in0' and 'in1' are
interleaved and written to 'out0'
*/
/* Word-element interleave pair: 'out0' receives __msa_ilvr_w(in0, in1) and
   'out1' receives __msa_ilvl_w(in0, in1), each cast to RTYPE.  The inputs
   are parenthesized before the v4i32 cast so that an expression argument is
   cast as a whole rather than only its first operand. */
#define ILVRL_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                               \
    out0 = (RTYPE) __msa_ilvr_w((v4i32) (in0), (v4i32) (in1));  \
    out1 = (RTYPE) __msa_ilvl_w((v4i32) (in0), (v4i32) (in1));  \
}
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
#define ILVRL_D2(RTYPE, in0, in1, out0, out1) \
{ \
out0 = (RTYPE) __msa_ilvr_d((v2i64) in0, (v2i64) in1); \
@ -90,4 +137,24 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
}
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
/* Description : Transpose 4x4 block with word elements in vectors
   Arguments   : Inputs  - in0, in1, in2, in3
                 Outputs - out0, out1, out2, out3
                 Return Type - as per RTYPE
   Details     : The four inputs are first word-interleaved pairwise into the
                 temporaries s0_m..s3_m; the outputs are then formed by
                 doubleword-interleaving those temporaries.  Because every
                 input is consumed before any output is assigned, the outputs
                 may name the same variables as the inputs.  Inputs are
                 forwarded in parentheses so expression arguments survive the
                 casts applied further down.
*/
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3)  \
{                                                                          \
    v4i32 s0_m, s1_m, s2_m, s3_m;                                          \
                                                                           \
    ILVRL_W2_SW((in1), (in0), s0_m, s1_m);                                 \
    ILVRL_W2_SW((in3), (in2), s2_m, s3_m);                                 \
                                                                           \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m);               \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m);               \
    out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m);               \
    out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m);               \
}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
#endif /* __MACROS_MSA_H__ */

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,177 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Interleaving pack routine built on MSA vector loads/stores.

   The source is walked as n "lines" whose starts are lda elements apart and
   which are each m elements long (presumably the columns of a column-major
   matrix -- confirm against the caller).  Lines are consumed in groups of
   8, then 4, then 2, then 1.  For a group of G lines the routine emits, for
   each position p = 0 .. m-1 in turn, the G elements found at position p of
   every line in the group, side by side.  A group therefore fills G * m
   consecutive elements of dst, and groups follow each other without gaps.

   m    - elements per line
   n    - number of lines
   src  - first element of the first line
   lda  - distance, in elements, between the starts of consecutive lines
   dst  - packed output buffer
   Returns 0 unconditionally. */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG groups, steps;
    FLOAT *group_base = src;    /* first line of the next group to pack */
    FLOAT *out = dst;           /* running write position */
    FLOAT *ln0, *ln1, *ln2, *ln3, *ln4, *ln5, *ln6, *ln7;
    /* loN / hiN: elements 0..3 / 4..7 of the current 8-element slice of line N */
    v4f32 lo0, lo1, lo2, lo3, lo4, lo5, lo6, lo7;
    v4f32 hi0, hi1, hi2, hi3, hi4, hi5, hi6, hi7;
    /* leftP / rightP: output for position P, taken from lines 0..3 / 4..7 */
    v4f32 left0, left1, left2, left3, left4, left5, left6, left7;
    v4f32 right0, right1, right2, right3, right4, right5, right6, right7;

    /* Groups of eight lines. */
    for (groups = n >> 3; groups != 0; groups--)
    {
        ln0 = group_base;
        ln1 = ln0 + lda;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        ln4 = ln3 + lda;
        ln5 = ln4 + lda;
        ln6 = ln5 + lda;
        ln7 = ln6 + lda;
        group_base += 8 * lda;

        /* Eight positions at a time: an 8x8 tile handled as four 4x4
           transposes. */
        for (steps = m >> 3; steps != 0; steps--)
        {
            LD_SP2(ln0, 4, lo0, hi0);
            LD_SP2(ln1, 4, lo1, hi1);
            LD_SP2(ln2, 4, lo2, hi2);
            LD_SP2(ln3, 4, lo3, hi3);
            LD_SP2(ln4, 4, lo4, hi4);
            LD_SP2(ln5, 4, lo5, hi5);
            LD_SP2(ln6, 4, lo6, hi6);
            LD_SP2(ln7, 4, lo7, hi7);

            ln0 += 8;
            ln1 += 8;
            ln2 += 8;
            ln3 += 8;
            ln4 += 8;
            ln5 += 8;
            ln6 += 8;
            ln7 += 8;

            TRANSPOSE4x4_SP_SP(lo0, lo1, lo2, lo3,
                               left0, left1, left2, left3);
            TRANSPOSE4x4_SP_SP(lo4, lo5, lo6, lo7,
                               right0, right1, right2, right3);
            TRANSPOSE4x4_SP_SP(hi0, hi1, hi2, hi3,
                               left4, left5, left6, left7);
            TRANSPOSE4x4_SP_SP(hi4, hi5, hi6, hi7,
                               right4, right5, right6, right7);

            /* One 8-element output row per position. */
            ST_SP2(left0, right0, out, 4);
            ST_SP2(left1, right1, out + 8, 4);
            ST_SP2(left2, right2, out + 16, 4);
            ST_SP2(left3, right3, out + 24, 4);
            ST_SP2(left4, right4, out + 32, 4);
            ST_SP2(left5, right5, out + 40, 4);
            ST_SP2(left6, right6, out + 48, 4);
            ST_SP2(left7, right7, out + 56, 4);
            out += 64;
        }

        /* Leftover positions, one at a time. */
        for (steps = m & 7; steps != 0; steps--)
        {
            out[0] = *ln0++;
            out[1] = *ln1++;
            out[2] = *ln2++;
            out[3] = *ln3++;
            out[4] = *ln4++;
            out[5] = *ln5++;
            out[6] = *ln6++;
            out[7] = *ln7++;
            out += 8;
        }
    }

    /* One group of four lines. */
    if (n & 4)
    {
        ln0 = group_base;
        ln1 = ln0 + lda;
        ln2 = ln1 + lda;
        ln3 = ln2 + lda;
        group_base += 4 * lda;

        for (steps = m >> 2; steps != 0; steps--)
        {
            lo0 = LD_SP(ln0);
            lo1 = LD_SP(ln1);
            lo2 = LD_SP(ln2);
            lo3 = LD_SP(ln3);

            ln0 += 4;
            ln1 += 4;
            ln2 += 4;
            ln3 += 4;

            TRANSPOSE4x4_SP_SP(lo0, lo1, lo2, lo3,
                               left0, left1, left2, left3);

            ST_SP2(left0, left1, out, 4);
            ST_SP2(left2, left3, out + 8, 4);
            out += 16;
        }

        for (steps = m & 3; steps != 0; steps--)
        {
            out[0] = *ln0++;
            out[1] = *ln1++;
            out[2] = *ln2++;
            out[3] = *ln3++;
            out += 4;
        }
    }

    /* One group of two lines, two positions per iteration. */
    if (n & 2)
    {
        ln0 = group_base;
        ln1 = ln0 + lda;
        group_base += 2 * lda;

        for (steps = m >> 1; steps != 0; steps--)
        {
            out[0] = ln0[0];
            out[1] = ln1[0];
            out[2] = ln0[1];
            out[3] = ln1[1];
            ln0 += 2;
            ln1 += 2;
            out += 4;
        }

        if (m & 1)
        {
            out[0] = *ln0;
            out[1] = *ln1;
            out += 2;
        }
    }

    /* Final single line: straight copy. */
    if (n & 1)
    {
        ln0 = group_base;

        for (steps = m; steps != 0; steps--)
        {
            *out++ = *ln0++;
        }
    }

    return 0;
}

View File

@ -0,0 +1,292 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Chunking pack routine built on MSA vector loads/stores.

   The source is walked as m "lines" whose starts are lda elements apart and
   which are each n elements long.  Every line is cut into 8-element chunks,
   followed by an optional 4-, 2- and 1-element remainder (bits 2, 1 and 0
   of n).  Elements are copied unchanged; only their placement differs:

     - 8-element chunk number c of line number l goes to dst + c*8*m + l*8
     - the 4-element remainders go to dst + m*(n & ~7), 4 per line
     - the 2-element remainders go to dst + m*(n & ~3), 2 per line
     - the 1-element remainders go to dst + m*(n & ~1), 1 per line

   Lines are processed in batches of 8, then 4, 2 and 1 (bits of m); the
   batching only affects how many lines are handled per pass, not the
   layout above.

   m    - number of lines
   n    - elements per line
   src  - first element of the first line
   lda  - distance, in elements, between the starts of consecutive lines
   dst  - packed output buffer
   Returns 0 unconditionally. */
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
          FLOAT * __restrict dst)
{
    BLASLONG batches, chunks;
    FLOAT *next_lines = src;    /* first line of the next batch */
    FLOAT *r0, *r1, *r2, *r3, *r4, *r5, *r6, *r7;
    FLOAT *wide_next = dst;     /* where the next batch starts in chunk 0 */
    FLOAT *wide;                /* write position for 8-element chunks */
    FLOAT *quad = dst + m * (n & ~7);     /* 4-element remainder region */
    FLOAT *pair = dst + m * (n & ~3);     /* 2-element remainder region */
    FLOAT *single = dst + m * (n & ~1);   /* 1-element remainder region */
    v4f32 v0, v1, v2, v3, v4, v5, v6, v7;
    v4f32 v8, v9, v10, v11, v12, v13, v14, v15;

    /* Batches of eight lines. */
    for (batches = m >> 3; batches != 0; batches--)
    {
        r0 = next_lines;
        r1 = r0 + lda;
        r2 = r1 + lda;
        r3 = r2 + lda;
        r4 = r3 + lda;
        r5 = r4 + lda;
        r6 = r5 + lda;
        r7 = r6 + lda;
        next_lines += 8 * lda;

        wide = wide_next;
        wide_next += 64;

        for (chunks = n >> 3; chunks != 0; chunks--)
        {
            LD_SP2(r0, 4, v0, v1);
            LD_SP2(r1, 4, v2, v3);
            LD_SP2(r2, 4, v4, v5);
            LD_SP2(r3, 4, v6, v7);
            LD_SP2(r4, 4, v8, v9);
            LD_SP2(r5, 4, v10, v11);
            LD_SP2(r6, 4, v12, v13);
            LD_SP2(r7, 4, v14, v15);

            r0 += 8;
            r1 += 8;
            r2 += 8;
            r3 += 8;
            r4 += 8;
            r5 += 8;
            r6 += 8;
            r7 += 8;

            /* Each line's 8 elements land in consecutive 8-element slots. */
            ST_SP2(v0, v1, wide, 4);
            ST_SP2(v2, v3, wide + 8, 4);
            ST_SP2(v4, v5, wide + 16, 4);
            ST_SP2(v6, v7, wide + 24, 4);
            ST_SP2(v8, v9, wide + 32, 4);
            ST_SP2(v10, v11, wide + 40, 4);
            ST_SP2(v12, v13, wide + 48, 4);
            ST_SP2(v14, v15, wide + 56, 4);

            /* Same lines, next chunk: one chunk spans 8 * m elements. */
            wide += m * 8;
        }

        if (n & 4)
        {
            v0 = LD_SP(r0);
            v1 = LD_SP(r1);
            v2 = LD_SP(r2);
            v3 = LD_SP(r3);
            v4 = LD_SP(r4);
            v5 = LD_SP(r5);
            v6 = LD_SP(r6);
            v7 = LD_SP(r7);

            r0 += 4;
            r1 += 4;
            r2 += 4;
            r3 += 4;
            r4 += 4;
            r5 += 4;
            r6 += 4;
            r7 += 4;

            ST_SP4(v0, v1, v2, v3, quad, 4);
            ST_SP4(v4, v5, v6, v7, quad + 16, 4);
            quad += 32;
        }

        if (n & 2)
        {
            pair[0] = r0[0];
            pair[1] = r0[1];
            pair[2] = r1[0];
            pair[3] = r1[1];
            pair[4] = r2[0];
            pair[5] = r2[1];
            pair[6] = r3[0];
            pair[7] = r3[1];
            pair[8] = r4[0];
            pair[9] = r4[1];
            pair[10] = r5[0];
            pair[11] = r5[1];
            pair[12] = r6[0];
            pair[13] = r6[1];
            pair[14] = r7[0];
            pair[15] = r7[1];
            pair += 16;

            r0 += 2;
            r1 += 2;
            r2 += 2;
            r3 += 2;
            r4 += 2;
            r5 += 2;
            r6 += 2;
            r7 += 2;
        }

        if (n & 1)
        {
            single[0] = *r0;
            single[1] = *r1;
            single[2] = *r2;
            single[3] = *r3;
            single[4] = *r4;
            single[5] = *r5;
            single[6] = *r6;
            single[7] = *r7;
            single += 8;
        }
    }

    /* One batch of four lines. */
    if (m & 4)
    {
        r0 = next_lines;
        r1 = r0 + lda;
        r2 = r1 + lda;
        r3 = r2 + lda;
        next_lines += 4 * lda;

        wide = wide_next;
        wide_next += 32;

        for (chunks = n >> 3; chunks != 0; chunks--)
        {
            LD_SP2(r0, 4, v0, v1);
            LD_SP2(r1, 4, v2, v3);
            LD_SP2(r2, 4, v4, v5);
            LD_SP2(r3, 4, v6, v7);

            r0 += 8;
            r1 += 8;
            r2 += 8;
            r3 += 8;

            ST_SP2(v0, v1, wide, 4);
            ST_SP2(v2, v3, wide + 8, 4);
            ST_SP2(v4, v5, wide + 16, 4);
            ST_SP2(v6, v7, wide + 24, 4);
            wide += 8 * m;
        }

        if (n & 4)
        {
            v0 = LD_SP(r0);
            v1 = LD_SP(r1);
            v2 = LD_SP(r2);
            v3 = LD_SP(r3);

            r0 += 4;
            r1 += 4;
            r2 += 4;
            r3 += 4;

            ST_SP4(v0, v1, v2, v3, quad, 4);
            quad += 16;
        }

        if (n & 2)
        {
            pair[0] = r0[0];
            pair[1] = r0[1];
            pair[2] = r1[0];
            pair[3] = r1[1];
            pair[4] = r2[0];
            pair[5] = r2[1];
            pair[6] = r3[0];
            pair[7] = r3[1];
            pair += 8;

            r0 += 2;
            r1 += 2;
            r2 += 2;
            r3 += 2;
        }

        if (n & 1)
        {
            single[0] = *r0;
            single[1] = *r1;
            single[2] = *r2;
            single[3] = *r3;
            single += 4;
        }
    }

    /* One batch of two lines. */
    if (m & 2)
    {
        r0 = next_lines;
        r1 = r0 + lda;
        next_lines += 2 * lda;

        wide = wide_next;
        wide_next += 16;

        for (chunks = n >> 3; chunks != 0; chunks--)
        {
            LD_SP2(r0, 4, v0, v1);
            LD_SP2(r1, 4, v2, v3);

            r0 += 8;
            r1 += 8;

            ST_SP2(v0, v1, wide, 4);
            ST_SP2(v2, v3, wide + 8, 4);
            wide += 8 * m;
        }

        if (n & 4)
        {
            v0 = LD_SP(r0);
            v1 = LD_SP(r1);

            r0 += 4;
            r1 += 4;

            ST_SP2(v0, v1, quad, 4);
            quad += 8;
        }

        if (n & 2)
        {
            pair[0] = r0[0];
            pair[1] = r0[1];
            pair[2] = r1[0];
            pair[3] = r1[1];
            pair += 4;

            r0 += 2;
            r1 += 2;
        }

        if (n & 1)
        {
            single[0] = *r0;
            single[1] = *r1;
            single += 2;
        }
    }

    /* Final single line. */
    if (m & 1)
    {
        r0 = next_lines;
        next_lines += lda;

        wide = wide_next;
        wide_next += 8;

        for (chunks = n >> 3; chunks != 0; chunks--)
        {
            LD_SP2(r0, 4, v0, v1);
            r0 += 8;

            ST_SP2(v0, v1, wide, 4);
            wide += 8 * m;
        }

        if (n & 4)
        {
            v0 = LD_SP(r0);
            r0 += 4;

            ST_SP(v0, quad);
            quad += 4;
        }

        if (n & 2)
        {
            pair[0] = r0[0];
            pair[1] = r0[1];
            pair += 2;

            r0 += 2;
        }

        if (n & 1)
        {
            single[0] = *r0;
            single += 1;
        }
    }

    return 0;
}

View File

@ -2182,8 +2182,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
@ -2221,8 +2221,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4