diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index ebe52ea8a..4431103bd 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -150,3 +150,6 @@ In chronological order: * theoractice * [2016-03-20] Fix compiler error in VisualStudio with CMake * [2016-03-22] Fix access violation on Windows while static linking + +* Shivraj Patil + * [2016-05-03] DGEMM optimization for MIPS P5600 and I6400 using MSA diff --git a/kernel/mips/KERNEL.P5600 b/kernel/mips/KERNEL.P5600 index 09064fe48..d21575251 100644 --- a/kernel/mips/KERNEL.P5600 +++ b/kernel/mips/KERNEL.P5600 @@ -91,9 +91,13 @@ SGEMMOTCOPY = ../generic/gemm_tcopy_2.c SGEMMONCOPYOBJ = sgemm_oncopy.o SGEMMOTCOPYOBJ = sgemm_otcopy.o -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy.o +DGEMMITCOPYOBJ = dgemm_itcopy.o DGEMMONCOPYOBJ = dgemm_oncopy.o DGEMMOTCOPYOBJ = dgemm_otcopy.o diff --git a/kernel/mips/dgemm_kernel_8x4_msa.c b/kernel/mips/dgemm_kernel_8x4_msa.c new file mode 100644 index 000000000..8d9e3455e --- /dev/null +++ b/kernel/mips/dgemm_kernel_8x4_msa.c @@ -0,0 +1,720 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT *A, FLOAT *B, + FLOAT *C, BLASLONG ldc +#ifdef TRMMKERNEL + , BLASLONG offset +#endif + ) +{ + BLASLONG i, j, l; + FLOAT *pc0, *pc1, *pc2, *pc3; + FLOAT *pa0, *pb0; + FLOAT tmp0, tmp1, tmp2, tmp3; + FLOAT a0; + FLOAT b0, b1, b2, b3; + v2f64 v_alpha = {alpha, alpha}; + v2f64 src_a0, src_a1, src_a2, src_a3, src_b, src_b0, src_b1; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + v2f64 res0, res1, res2, res3, res4, res5, res6, res7; + v2f64 res8, res9, res10, res11, res12, res13, res14, res15; + + for (j = (n / 4); j--;) + { + pc0 = C; + pc1 = pc0 + ldc; + pc2 = pc1 + ldc; + pc3 = pc2 + ldc; + + pa0 = A; + + for (i = (m / 8); i--;) + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 = src_a0 * src_b; + res9 = src_a1 * src_b; + res10 = src_a2 * src_b; + res11 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 = src_a0 * src_b; + res13 = src_a1 * src_b; + res14 = src_a2 * src_b; + res15 = src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + + for (l = (k - 1); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res8 += src_a0 * src_b; + res9 += src_a1 * src_b; + res10 += src_a2 * src_b; + res11 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res12 += src_a0 * src_b; + res13 += src_a1 * src_b; + res14 += src_a2 * src_b; + res15 += src_a3 * src_b; + + pa0 += 8; + pb0 += 4; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); + + LD_DP4(pc2, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc3, 2, dst4, dst5, dst6, dst7); + + dst0 += res8 * v_alpha; + dst1 += res9 * v_alpha; + dst2 += res10 * v_alpha; + dst3 += res11 * v_alpha; + dst4 += res12 * v_alpha; + dst5 += res13 * v_alpha; + dst6 += res14 * v_alpha; + dst7 += res15 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc2, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc3, 2); + + pc0 += 8; + pc1 += 8; + pc2 += 8; + pc3 += 8; + } + + for (i = ((m & 4) / 4); i--;) + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 = src_a0 * src_b; + res7 = src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res6 += src_a0 * src_b; + res7 += src_a1 * src_b; + + pa0 += 4; + pb0 += 4; + } + + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + LD_DP2(pc2, 2, dst4, dst5); + LD_DP2(pc3, 2, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + ST_DP2(dst2, dst3, pc1, 2); + ST_DP2(dst4, dst5, pc2, 2); + ST_DP2(dst6, dst7, pc3, 2); + + pc0 += 4; + pc1 += 4; + pc2 += 4; + pc3 += 4; + } + + for (i = ((m & 2) / 2); i--;) + { + pb0 = B; + + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 = src_a0 * src_b; + + pa0 += 2; + pb0 += 4; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + LD_DP2(pb0, 2, src_b0, src_b1); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b1, (v2i64) src_b1); + res2 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b1, (v2i64) src_b1); + res3 += src_a0 * src_b; + + pa0 += 2; + pb0 += 4; + } + + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + dst2 = LD_DP(pc2); + dst3 = LD_DP(pc3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + ST_DP(dst2, pc2); + ST_DP(dst3, pc3); + + pc0 += 2; + pc1 += 2; + pc2 += 2; + pc3 += 2; + } + + for (i = (m & 1); i--;) + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + b2 = pb0[2]; + tmp2 = a0 * b2; + + b3 = pb0[3]; + tmp3 = a0 * b3; + + pa0 += 1; + pb0 += 4; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + b2 = pb0[2]; + tmp2 += a0 * b2; + + b3 = pb0[3]; + tmp3 += a0 * b3; + + pa0 += 1; + pb0 += 4; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + tmp2 = alpha * tmp2; + tmp3 = alpha * tmp3; + + pc0[0] += tmp0; + pc1[0] += tmp1; + pc2[0] += tmp2; + pc3[0] += tmp3; + + pc0 += 1; + pc1 += 1; + pc2 += 1; + pc3 += 1; + } + + l = (k << 2); + B = B + l; + i = (ldc << 2); + C = C + i; + } + + for (j = ((n & 2) / 2); j--;) + { + pc0 = C; + pc1 = pc0 + ldc; + + pa0 = A; + + for (i = (m / 8); i--;) + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 = src_a0 * src_b; + res5 = src_a1 * src_b; + res6 = src_a2 * src_b; + res7 = src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + + for (l = (k - 1); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res4 += src_a0 * src_b; + res5 += src_a1 * src_b; + res6 += src_a2 * src_b; + res7 += src_a3 * src_b; + + pa0 += 8; + pb0 += 2; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + LD_DP4(pc1, 2, dst4, dst5, dst6, dst7); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + dst4 += res4 * v_alpha; + dst5 += res5 * v_alpha; + dst6 += res6 * v_alpha; + dst7 += res7 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + ST_DP4(dst4, dst5, dst6, dst7, pc1, 2); + + pc0 += 8; + pc1 += 8; + } + + for (i = ((m & 4) / 4); i--;) + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 = src_a0 * src_b; + res3 = src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res2 += src_a0 * src_b; + res3 += src_a1 * src_b; + + pa0 += 4; + pb0 += 2; + } + + LD_DP2(pc0, 2, dst0, dst1); + LD_DP2(pc1, 2, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + ST_DP2(dst2, dst3, pc1, 2); + + pc0 += 4; + pc1 += 4; + } + + for (i = ((m & 2) / 2); i--;) + { + pb0 = B; + + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 = src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 = src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + src_b0 = LD_DP(pb0); + + src_b = (v2f64) __msa_ilvr_d((v2i64) src_b0, (v2i64) src_b0); + res0 += src_a0 * src_b; + + src_b = (v2f64) __msa_ilvl_d((v2i64) src_b0, (v2i64) src_b0); + res1 += src_a0 * src_b; + + pa0 += 2; + pb0 += 2; + } + + dst0 = LD_DP(pc0); + dst1 = LD_DP(pc1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + + ST_DP(dst0, pc0); + ST_DP(dst1, pc1); + + pc0 += 2; + pc1 += 2; + } + + for (i = (m & 1); i--;) + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + b1 = pb0[1]; + tmp1 = a0 * b1; + + pa0 += 1; + pb0 += 2; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + b1 = pb0[1]; + tmp1 += a0 * b1; + + pa0 += 1; + pb0 += 2; + } + + tmp0 = alpha * tmp0; + tmp1 = alpha * tmp1; + + pc0[0] += tmp0; + pc1[0] += tmp1; + + pc0 += 1; + pc1 += 1; + } + + l = (k << 1); + B = B + l; + i = (ldc << 1); + C = C + i; + } + + for (j = (n & 1); j--;) + { + pc0 = C; + pa0 = A; + + for (i = (m / 8); i--;) + { + pb0 = B; + + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + res2 = src_a2 * src_b; + res3 = src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + + for (l = (k - 1); l--;) + { + LD_DP4(pa0, 2, src_a0, src_a1, src_a2, src_a3); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + res2 += src_a2 * src_b; + res3 += src_a3 * src_b; + + pa0 += 8; + pb0 += 1; + } + + LD_DP4(pc0, 2, dst0, dst1, dst2, dst3); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + dst2 += res2 * v_alpha; + dst3 += res3 * v_alpha; + + ST_DP4(dst0, dst1, dst2, dst3, pc0, 2); + + pc0 += 8; + } + + for (i = ((m & 4) / 4); i--;) + { + pb0 = B; + + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + res1 = src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + + for (l = (k - 1); l--;) + { + LD_DP2(pa0, 2, src_a0, src_a1); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + res1 += src_a1 * src_b; + + pa0 += 4; + pb0 += 1; + } + + LD_DP2(pc0, 2, dst0, dst1); + + dst0 += res0 * v_alpha; + dst1 += res1 * v_alpha; + + ST_DP2(dst0, dst1, pc0, 2); + + pc0 += 4; + } + + for (i = ((m & 2) / 2); i--;) + { + pb0 = B; + + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 = src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + + for (l = (k - 1); l--;) + { + src_a0 = LD_DP(pa0); + src_b[0] = pb0[0]; + src_b[1] = pb0[0]; + + res0 += src_a0 * src_b; + + pa0 += 2; + pb0 += 1; + } + + dst0 = LD_DP(pc0); + + dst0 += res0 * v_alpha; + + ST_DP(dst0, pc0); + + pc0 += 2; + } + + for (i = (m & 1); i--;) + { + pb0 = B; + + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 = a0 * b0; + + pa0 += 1; + pb0 += 1; + + for (l = (k - 1); l--;) + { + a0 = pa0[0]; + b0 = pb0[0]; + tmp0 += a0 * b0; + + pa0 += 1; + pb0 += 1; + } + + pc0[0] += alpha * tmp0; + + pc0 += 1; + } + + l = (k << 0); + B = B + l; + i = (ldc << 0); + C = C + i; + } + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_4_msa.c b/kernel/mips/dgemm_ncopy_4_msa.c new file mode 100644 index 000000000..bbd76070f --- /dev/null +++ b/kernel/mips/dgemm_ncopy_4_msa.c @@ -0,0 +1,135 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + + ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); + pdst += 16; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + psrc1 += 4; + psrc2 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + + ST_DP4(dst0, dst4, dst1, dst5, pdst, 2); + pdst += 8; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst, 2); + pdst += 4; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_ncopy_8_msa.c b/kernel/mips/dgemm_ncopy_8_msa.c new file mode 100644 index 000000000..43c977582 --- /dev/null +++ b/kernel/mips/dgemm_ncopy_8_msa.c @@ -0,0 +1,228 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *psrc5, *psrc6, *psrc7, *psrc8; + FLOAT *pdst; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7; + + psrc0 = src; + pdst = dst; + + for (j = (n >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + for (i = (m >> 3); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + LD_DP2(psrc5, 2, src8, src9); + LD_DP2(psrc6, 2, src10, src11); + LD_DP2(psrc7, 2, src12, src13); + LD_DP2(psrc8, 2, src14, src15); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16, + 2); + + LD_DP2(psrc1 + 4, 2, src0, src1); + LD_DP2(psrc2 + 4, 2, src2, src3); + LD_DP2(psrc3 + 4, 2, src4, src5); + LD_DP2(psrc4 + 4, 2, src6, src7); + LD_DP2(psrc5 + 4, 2, src8, src9); + LD_DP2(psrc6 + 4, 2, src10, src11); + LD_DP2(psrc7 + 4, 2, src12, src13); + LD_DP2(psrc8 + 4, 2, src14, src15); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32, + 2); + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13); + dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13); + + ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48, + 2); + + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + psrc5 += 8; + psrc6 += 8; + psrc7 += 8; + psrc8 += 8; + pdst += 64; + } + + for (i = (m & 7); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + *pdst++ = *psrc5++; + *pdst++ = *psrc6++; + *pdst++ = *psrc7++; + *pdst++ = *psrc8++; + } + } + + if (n & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + for (i = (m >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0); + dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4); + dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1); + dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5); + + dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0); + dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4); + dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1); + dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5); + + ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2); + pdst += 16; + } + + for (i = (m & 3); i--;) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + *pdst++ = *psrc3++; + *pdst++ = *psrc4++; + } + } + + if (n & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + for (i = (m >> 1); i--;) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0); + dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0); + + ST_DP2(dst0, dst1, pdst, 2); + pdst += 4; + } + + if (m & 1) + { + *pdst++ = *psrc1++; + *pdst++ = *psrc2++; + } + } + + if (n & 1) + { + psrc1 = psrc0; + + for (i = m; i--;) + { + *pdst++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_4_msa.c b/kernel/mips/dgemm_tcopy_4_msa.c new file mode 100644 index 000000000..f147d190e --- /dev/null +++ b/kernel/mips/dgemm_tcopy_4_msa.c @@ -0,0 +1,162 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~3); + pdst3 = dst + m * (n & ~1); + + for (j = (m >> 2); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + pdst1 += m * 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4(src0, src1, src2, src3, pdst2, 2); + pdst2 += 8; + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + *pdst3++ = *psrc3++; + *pdst3++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + psrc1 += 4; + psrc2 += 4; + + ST_DP4(src0, src1, src2, src3, pdst1, 2); + pdst1 += m * 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2(src0, src1, pdst2, 2); + pdst2 += 4; + } + + if (n & 1) + { + *pdst3++ = *psrc1++; + *pdst3++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + pdst1 = pdst0; + + for (i = (n >> 2); i--;) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst1, 2); + pdst1 += 4 * m; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + psrc1 += 2; + + ST_DP(src0, pdst2); + } + + if (n & 1) + { + *pdst3 = *psrc1; + } + } + + return 0; +} diff --git a/kernel/mips/dgemm_tcopy_8_msa.c b/kernel/mips/dgemm_tcopy_8_msa.c new file mode 100644 index 000000000..d1ac49b5a --- /dev/null +++ b/kernel/mips/dgemm_tcopy_8_msa.c @@ -0,0 +1,317 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#include "common.h" +#include "macros_msa.h" + +int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda, + FLOAT * __restrict dst) +{ + BLASLONG i, j; + FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4; + FLOAT *psrc5, *psrc6, *psrc7, *psrc8; + FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4; + v2f64 src0, src1, src2, src3, src4, src5, src6, src7; + v2f64 src8, src9, src10, src11, src12, src13, src14, src15; + + psrc0 = src; + pdst0 = dst; + + pdst2 = dst + m * (n & ~7); + pdst3 = dst + m * (n & ~3); + pdst4 = dst + m * (n & ~1); + + for (j = (m >> 3); j--;) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc5 = psrc4 + lda; + psrc6 = psrc5 + lda; + psrc7 = psrc6 + lda; + psrc8 = psrc7 + lda; + psrc0 += 8 * lda; + + pdst1 = pdst0; + pdst0 += 64; + + for (i = (n >> 3); i--;) + { + LD_DP4(psrc1, 2, src0, src1, src2, src3); + LD_DP4(psrc2, 2, src4, src5, src6, src7); + LD_DP4(psrc3, 2, src8, src9, src10, src11); + LD_DP4(psrc4, 2, src12, src13, src14, src15); + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + + LD_DP4(psrc5, 2, src0, src1, src2, src3); + LD_DP4(psrc6, 2, src4, src5, src6, src7); + LD_DP4(psrc7, 2, src8, src9, src10, src11); + LD_DP4(psrc8, 2, src12, src13, src14, src15); + psrc5 += 8; + psrc6 += 8; + psrc7 += 8; + psrc8 += 8; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32, + 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 48, 2); + pdst1 += m * 8; + } + + if (n & 4) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + LD_DP2(psrc5, 2, src8, src9); + LD_DP2(psrc6, 2, src10, src11); + LD_DP2(psrc7, 2, src12, src13); + LD_DP2(psrc8, 2, src14, src15); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + psrc5 += 4; + psrc6 += 4; + psrc7 += 4; + psrc8 += 4; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst2 + 16, 2); + pdst2 += 32; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + src4 = LD_DP(psrc5); + src5 = LD_DP(psrc6); + src6 = LD_DP(psrc7); + src7 = LD_DP(psrc8); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + psrc5 += 2; + psrc6 += 2; + psrc7 += 2; + psrc8 += 2; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2); + pdst3 += 16; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + *pdst4++ = *psrc5++; + *pdst4++ = *psrc6++; + *pdst4++ = *psrc7++; + *pdst4++ = *psrc8++; + } + } + + if (m & 4) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc3 = psrc2 + lda; + psrc4 = psrc3 + lda; + psrc0 += 4 * lda; + + pdst1 = pdst0; + pdst0 += 32; + + for (i = (n >> 3); i--;) + { + LD_DP4(psrc1, 2, src0, src1, src2, src3); + LD_DP4(psrc2, 2, src4, src5, src6, src7); + LD_DP4(psrc3, 2, src8, src9, src10, src11); + LD_DP4(psrc4, 2, src12, src13, src14, src15); + psrc1 += 8; + psrc2 += 8; + psrc3 += 8; + psrc4 += 8; + psrc5 += 8; + psrc6 += 8; + psrc7 += 8; + psrc8 += 8; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15, + pdst1 + 16, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + LD_DP2(psrc3, 2, src4, src5); + LD_DP2(psrc4, 2, src6, src7); + psrc1 += 4; + psrc2 += 4; + psrc3 += 4; + psrc4 += 4; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2); + pdst2 += 16; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + src2 = LD_DP(psrc3); + src3 = LD_DP(psrc4); + psrc1 += 2; + psrc2 += 2; + psrc3 += 2; + psrc4 += 2; + + ST_DP4(src0, src1, src2, src3, pdst3, 2); + pdst3 += 8; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + *pdst4++ = *psrc3++; + *pdst4++ = *psrc4++; + } + } + + if (m & 2) + { + psrc1 = psrc0; + psrc2 = psrc1 + lda; + psrc0 += 2 * lda; + + pdst1 = pdst0; + pdst0 += 16; + + for (i = (n >> 3); i--;) + { + LD_DP4(psrc1, 2, src0, src1, src2, src3); + LD_DP4(psrc2, 2, src4, src5, src6, src7); + psrc1 += 8; + psrc2 += 8; + + ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2(psrc1, 2, src0, src1); + LD_DP2(psrc2, 2, src2, src3); + psrc1 += 4; + psrc2 += 4; + + ST_DP4(src0, src1, src2, src3, pdst2, 2); + pdst2 += 8; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + src1 = LD_DP(psrc2); + psrc1 += 2; + psrc2 += 2; + + ST_DP2(src0, src1, pdst3, 2); + pdst3 += 4; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + *pdst4++ = *psrc2++; + } + } + + if (m & 1) + { + psrc1 = psrc0; + psrc0 += lda; + + pdst1 = pdst0; + pdst0 += 8; + + for (i = (n >> 3); i--;) + { + LD_DP4(psrc1, 2, src0, src1, src2, src3); + psrc1 += 8; + + ST_DP4(src0, src1, src2, src3, pdst1, 2); + pdst1 += 8 * m; + } + + if (n & 4) + { + LD_DP2(psrc1, 2, src0, src1); + psrc1 += 4; + + ST_DP2(src0, src1, pdst2, 2); + pdst2 += 4; + } + + if (n & 2) + { + src0 = LD_DP(psrc1); + psrc1 += 2; + + ST_DP(src0, pdst3); + pdst3 += 2; + } + + if (n & 1) + { + *pdst4++ = *psrc1++; + } + } + + return 0; +} diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h new file mode 100644 index 000000000..3bcc59629 --- /dev/null +++ b/kernel/mips/macros_msa.h @@ -0,0 +1,79 @@ +/******************************************************************************* +Copyright (c) 2016, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*******************************************************************************/ + +#ifndef __MACROS_MSA_H__ +#define __MACROS_MSA_H__ + +#include + +#define LD_D(RTYPE, psrc) *((RTYPE *)(psrc)) +#define LD_DP(...) LD_D(v2f64, __VA_ARGS__) + +#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in) +#define ST_DP(...) ST_D(v2f64, __VA_ARGS__) + +/* Description : Load 2 vectors of double precision floating point elements with stride + Arguments : Inputs - psrc, stride + Outputs - out0, out1 + Return Type - double precision floating point +*/ +#define LD_DP2(psrc, stride, out0, out1) \ +{ \ + out0 = LD_DP((psrc)); \ + out1 = LD_DP((psrc) + stride); \ +} + +#define LD_DP4(psrc, stride, out0, out1, out2, out3) \ +{ \ + LD_DP2(psrc, stride, out0, out1) \ + LD_DP2(psrc + 2 * stride, stride, out2, out3) \ +} + +/* Description : Store vectors of double precision floating point elements with stride + Arguments : Inputs - in0, in1, pdst, stride + Details : Store 2 double precision floating point elements from 'in0' to (pdst) + Store 2 double precision floating point elements from 'in1' to (pdst + stride) +*/ +#define ST_DP2(in0, in1, pdst, stride) \ +{ \ + ST_DP(in0, (pdst)); \ + ST_DP(in1, (pdst) + stride); \ +} + +#define ST_DP4(in0, in1, in2, in3, pdst, stride) \ +{ \ + ST_DP2(in0, in1, (pdst), stride); \ + ST_DP2(in2, in3, (pdst) + 2 * stride, stride); \ +} + +#define ST_DP8(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \ +{ \ + ST_DP4(in0, in1, in2, in3, (pdst), stride); \ + ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \ +} + +#endif /* __MACROS_MSA_H__ */ diff --git a/param.h b/param.h index 93b1220d6..6948e6a76 100644 --- a/param.h +++ b/param.h @@ -2185,8 +2185,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2 @@ -2224,8 +2224,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SGEMM_DEFAULT_UNROLL_M 2 #define SGEMM_DEFAULT_UNROLL_N 2 -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 #define CGEMM_DEFAULT_UNROLL_M 2 #define CGEMM_DEFAULT_UNROLL_N 2