Added CGEMM, ZGEMM, STRMM, DTRMM, CTRMM, ZTRMM. Updated macros in SGEMM, DGEMM, STRMM.
Signed-off-by: Shivraj Patil <shivraj.patil@imgtec.com>
This commit is contained in:
parent
437c7d64f2
commit
57df7956ee
|
@ -80,11 +80,6 @@ DGEMVTKERNEL = ../mips/gemv_t.c
|
|||
CGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
|
||||
STRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
|
||||
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||
|
||||
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
|
||||
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
|
||||
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
|
||||
|
@ -101,15 +96,19 @@ DGEMMITCOPYOBJ = dgemm_itcopy.o
|
|||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
|
||||
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
|
||||
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
|
||||
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
|
||||
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
|
||||
CGEMMINCOPYOBJ = cgemm_incopy.o
|
||||
CGEMMITCOPYOBJ = cgemm_itcopy.o
|
||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
||||
|
||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
|
||||
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
|
||||
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
|
||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,195 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Packs an m x n block of two-FLOAT elements from src into dst.
 *
 * src is walked in groups of lines that are lda elements apart (lda is
 * doubled below to turn it into a FLOAT offset): four lines at a time, then
 * a group of two, then a single line, as selected by n. Inside a group the
 * lines are read front to back, four, two and finally one element per step
 * as selected by m, and for every step the matching elements of all lines in
 * the group are written next to each other. dst is filled strictly
 * sequentially.
 *
 * NOTE(review): the two-FLOAT element is presumably a complex (re, im) pair.
 * LD_SP*, ST_SP* and ILVRL_D2_SP come from macros_msa.h, which is not
 * visible here; the *_INC variants are assumed to advance the pointer they
 * are handed, and ILVRL_D2_SP is assumed to produce the right/left 64-bit
 * interleave of its two inputs - confirm against that header.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG row_cnt, grp_cnt;
    FLOAT *next_col, *col_a, *col_b, *col_c, *col_d, *out;
    FLOAT fa0, fa1, fb0, fb1, fc0, fc1, fd0, fd1;
    v4f32 a_v0, a_v1, b_v0, b_v1, c_v0, c_v1, d_v0, d_v1;
    v4f32 ab_lo, cd_lo, ab_hi, cd_hi;

    next_col = src;
    out = dst;
    lda *= 2;   /* two FLOATs per element */

    /* ---- groups of four lines ---- */
    grp_cnt = n >> 2;
    while (grp_cnt--)
    {
        col_a = next_col;
        col_b = col_a + lda;
        col_c = col_b + lda;
        col_d = col_c + lda;
        next_col += 4 * lda;

        /* four elements (two vectors) from each line per step */
        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            LD_SP2_INC(col_b, 4, b_v0, b_v1);
            LD_SP2_INC(col_c, 4, c_v0, c_v1);
            LD_SP2_INC(col_d, 4, d_v0, d_v1);

            /* first vector of every line */
            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);

            /* second vector of every line */
            ILVRL_D2_SP(b_v1, a_v1, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v1, c_v1, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);
        }

        /* two leftover elements: one vector per line */
        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            b_v0 = LD_SP(col_b);
            c_v0 = LD_SP(col_c);
            d_v0 = LD_SP(col_d);
            col_a += 4;
            col_b += 4;
            col_c += 4;
            col_d += 4;

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);
        }

        /* one leftover element: plain scalar copy, all loads before stores */
        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            fb0 = col_b[0];
            fb1 = col_b[1];
            fc0 = col_c[0];
            fc1 = col_c[1];
            fd0 = col_d[0];
            fd1 = col_d[1];
            col_a += 2;
            col_b += 2;
            col_c += 2;
            col_d += 2;

            out[0] = fa0;
            out[1] = fa1;
            out[2] = fb0;
            out[3] = fb1;
            out[4] = fc0;
            out[5] = fc1;
            out[6] = fd0;
            out[7] = fd1;
            out += 8;
        }
    }

    /* ---- one group of two lines ---- */
    if (n & 2)
    {
        col_a = next_col;
        col_b = col_a + lda;
        next_col += 2 * lda;

        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            LD_SP2_INC(col_b, 4, b_v0, b_v1);

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);

            ILVRL_D2_SP(b_v1, a_v1, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);
        }

        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            b_v0 = LD_SP(col_b);
            col_a += 4;
            col_b += 4;

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);
        }

        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            fb0 = col_b[0];
            fb1 = col_b[1];
            col_a += 2;
            col_b += 2;

            out[0] = fa0;
            out[1] = fa1;
            out[2] = fb0;
            out[3] = fb1;
            out += 4;
        }
    }

    /* ---- final single line: straight copy, no interleave needed ---- */
    if (n & 1)
    {
        col_a = next_col;

        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            ST_SP2_INC(a_v0, a_v1, out, 4);
        }

        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            col_a += 4;

            ST_SP(a_v0, out);
            out += 4;
        }

        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            col_a += 2;

            out[0] = fa0;
            out[1] = fa1;
            out += 2;
        }
    }

    return 0;
}
|
|
@ -0,0 +1,310 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Packs an m x n block of two-FLOAT elements from src into dst, eight
 * lda-strided lines at a time.
 *
 * lda is doubled on entry to turn it into a FLOAT offset. Lines are taken
 * in groups of eight, then (as selected by n) one group of four, one of two
 * and one single line. Inside a group each line is read front to back,
 * four, two and finally one element per step as selected by m; for every
 * step the matching elements of all lines in the group are stored next to
 * each other. dst is written strictly sequentially.
 *
 * NOTE(review): the two-FLOAT element is presumably a complex (re, im) pair.
 * LD_SP*, ST_SP* and ILVRL_D2_SP are defined in macros_msa.h, which is not
 * visible here; the *_INC variants are assumed to advance the pointer they
 * are handed, and ILVRL_D2_SP is assumed to produce the right/left 64-bit
 * interleave of its two inputs - confirm against that header.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG row_cnt, grp_cnt;
    FLOAT *next_col, *out;
    FLOAT *col_a, *col_b, *col_c, *col_d, *col_e, *col_f, *col_g, *col_h;
    FLOAT fa0, fa1, fb0, fb1, fc0, fc1, fd0, fd1;
    FLOAT fe0, fe1, ff0, ff1, fg0, fg1, fh0, fh1;
    v4f32 a_v0, a_v1, b_v0, b_v1, c_v0, c_v1, d_v0, d_v1;
    v4f32 e_v0, e_v1, f_v0, f_v1, g_v0, g_v1, h_v0, h_v1;
    v4f32 ab_lo, cd_lo, ef_lo, gh_lo, ab_hi, cd_hi, ef_hi, gh_hi;

    next_col = src;
    out = dst;
    lda *= 2;   /* two FLOATs per element */

    /* ---- groups of eight lines ---- */
    grp_cnt = n >> 3;
    while (grp_cnt--)
    {
        col_a = next_col;
        col_b = col_a + lda;
        col_c = col_b + lda;
        col_d = col_c + lda;
        col_e = col_d + lda;
        col_f = col_e + lda;
        col_g = col_f + lda;
        col_h = col_g + lda;
        next_col += 8 * lda;

        /* four elements (two vectors) from each line per step */
        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            LD_SP2_INC(col_b, 4, b_v0, b_v1);
            LD_SP2_INC(col_c, 4, c_v0, c_v1);
            LD_SP2_INC(col_d, 4, d_v0, d_v1);
            LD_SP2_INC(col_e, 4, e_v0, e_v1);
            LD_SP2_INC(col_f, 4, f_v0, f_v1);
            LD_SP2_INC(col_g, 4, g_v0, g_v1);
            LD_SP2_INC(col_h, 4, h_v0, h_v1);

            /* first vector of every line */
            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);
            ILVRL_D2_SP(f_v0, e_v0, ef_lo, ef_hi);
            ILVRL_D2_SP(h_v0, g_v0, gh_lo, gh_hi);

            ST_SP8_INC(ab_lo, cd_lo, ef_lo, gh_lo, ab_hi, cd_hi, ef_hi, gh_hi, out, 4);

            /* second vector of every line */
            ILVRL_D2_SP(b_v1, a_v1, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v1, c_v1, cd_lo, cd_hi);
            ILVRL_D2_SP(f_v1, e_v1, ef_lo, ef_hi);
            ILVRL_D2_SP(h_v1, g_v1, gh_lo, gh_hi);

            ST_SP8_INC(ab_lo, cd_lo, ef_lo, gh_lo, ab_hi, cd_hi, ef_hi, gh_hi, out, 4);
        }

        /* two leftover elements: one vector per line */
        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            b_v0 = LD_SP(col_b);
            c_v0 = LD_SP(col_c);
            d_v0 = LD_SP(col_d);
            e_v0 = LD_SP(col_e);
            f_v0 = LD_SP(col_f);
            g_v0 = LD_SP(col_g);
            h_v0 = LD_SP(col_h);
            col_a += 4;
            col_b += 4;
            col_c += 4;
            col_d += 4;
            col_e += 4;
            col_f += 4;
            col_g += 4;
            col_h += 4;

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);
            ILVRL_D2_SP(f_v0, e_v0, ef_lo, ef_hi);
            ILVRL_D2_SP(h_v0, g_v0, gh_lo, gh_hi);

            ST_SP8_INC(ab_lo, cd_lo, ef_lo, gh_lo, ab_hi, cd_hi, ef_hi, gh_hi, out, 4);
        }

        /* one leftover element: scalar copy, all loads before any store */
        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            fb0 = col_b[0];
            fb1 = col_b[1];
            fc0 = col_c[0];
            fc1 = col_c[1];
            fd0 = col_d[0];
            fd1 = col_d[1];
            fe0 = col_e[0];
            fe1 = col_e[1];
            ff0 = col_f[0];
            ff1 = col_f[1];
            fg0 = col_g[0];
            fg1 = col_g[1];
            fh0 = col_h[0];
            fh1 = col_h[1];
            col_a += 2;
            col_b += 2;
            col_c += 2;
            col_d += 2;
            col_e += 2;
            col_f += 2;
            col_g += 2;
            col_h += 2;

            out[0] = fa0;
            out[1] = fa1;
            out[2] = fb0;
            out[3] = fb1;
            out[4] = fc0;
            out[5] = fc1;
            out[6] = fd0;
            out[7] = fd1;
            out[8] = fe0;
            out[9] = fe1;
            out[10] = ff0;
            out[11] = ff1;
            out[12] = fg0;
            out[13] = fg1;
            out[14] = fh0;
            out[15] = fh1;
            out += 16;
        }
    }

    /* ---- one group of four lines ---- */
    if (n & 4)
    {
        col_a = next_col;
        col_b = col_a + lda;
        col_c = col_b + lda;
        col_d = col_c + lda;
        next_col += 4 * lda;

        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            LD_SP2_INC(col_b, 4, b_v0, b_v1);
            LD_SP2_INC(col_c, 4, c_v0, c_v1);
            LD_SP2_INC(col_d, 4, d_v0, d_v1);

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);

            ILVRL_D2_SP(b_v1, a_v1, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v1, c_v1, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);
        }

        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            b_v0 = LD_SP(col_b);
            c_v0 = LD_SP(col_c);
            d_v0 = LD_SP(col_d);
            col_a += 4;
            col_b += 4;
            col_c += 4;
            col_d += 4;

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);
            ILVRL_D2_SP(d_v0, c_v0, cd_lo, cd_hi);

            ST_SP4_INC(ab_lo, cd_lo, ab_hi, cd_hi, out, 4);
        }

        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            fb0 = col_b[0];
            fb1 = col_b[1];
            fc0 = col_c[0];
            fc1 = col_c[1];
            fd0 = col_d[0];
            fd1 = col_d[1];
            col_a += 2;
            col_b += 2;
            col_c += 2;
            col_d += 2;

            out[0] = fa0;
            out[1] = fa1;
            out[2] = fb0;
            out[3] = fb1;
            out[4] = fc0;
            out[5] = fc1;
            out[6] = fd0;
            out[7] = fd1;
            out += 8;
        }
    }

    /* ---- one group of two lines ---- */
    if (n & 2)
    {
        col_a = next_col;
        col_b = col_a + lda;
        next_col += 2 * lda;

        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            LD_SP2_INC(col_b, 4, b_v0, b_v1);

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);

            ILVRL_D2_SP(b_v1, a_v1, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);
        }

        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            b_v0 = LD_SP(col_b);
            col_a += 4;
            col_b += 4;

            ILVRL_D2_SP(b_v0, a_v0, ab_lo, ab_hi);

            ST_SP2_INC(ab_lo, ab_hi, out, 4);
        }

        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            fb0 = col_b[0];
            fb1 = col_b[1];
            col_a += 2;
            col_b += 2;

            out[0] = fa0;
            out[1] = fa1;
            out[2] = fb0;
            out[3] = fb1;
            out += 4;
        }
    }

    /* ---- final single line: straight copy, no interleave needed ---- */
    if (n & 1)
    {
        col_a = next_col;

        row_cnt = m >> 2;
        while (row_cnt--)
        {
            LD_SP2_INC(col_a, 4, a_v0, a_v1);
            ST_SP2_INC(a_v0, a_v1, out, 4);
        }

        if (m & 2)
        {
            a_v0 = LD_SP(col_a);
            col_a += 4;

            ST_SP(a_v0, out);
            out += 4;
        }

        if (m & 1)
        {
            fa0 = col_a[0];
            fa1 = col_a[1];
            col_a += 2;

            out[0] = fa0;
            out[1] = fa1;
            out += 2;
        }
    }

    return 0;
}
|
|
@ -0,0 +1,125 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Packs src into dst in strips that are four two-FLOAT elements wide.
 *
 * lda is doubled on entry to turn it into a FLOAT offset. The contiguous
 * direction of src is cut into strips of four elements (8 FLOATs), followed
 * - as selected by n - by one strip of two elements and one of a single
 * element. For each strip the m lda-strided lines are visited two per step
 * (plus one trailing line when m is odd) and the strip's slice of each line
 * is appended to dst, so dst is written strictly sequentially.
 *
 * NOTE(review): the two-FLOAT element is presumably a complex (re, im) pair.
 * LD_SP, LD_SP2, ST_SP and ST_SP*_INC come from macros_msa.h, which is not
 * visible here; LD_SP2 is assumed to leave its pointer untouched while the
 * ST_*_INC forms are assumed to advance the destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG pair_cnt, strip_cnt;
    FLOAT *strip, *line0, *line1, *out;
    FLOAT f00, f01, f10, f11;
    v4f32 l0_v0, l0_v1, l1_v0, l1_v1;

    strip = src;
    out = dst;
    lda *= 2;   /* two FLOATs per element */

    /* ---- strips of four elements ---- */
    strip_cnt = n >> 2;
    while (strip_cnt--)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 8;

        /* two lines per step, two vectors from each */
        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            LD_SP2(line0, 4, l0_v0, l0_v1);
            LD_SP2(line1, 4, l1_v0, l1_v1);
            ST_SP4_INC(l0_v0, l0_v1, l1_v0, l1_v1, out, 4);
            line0 += 2 * lda;
            line1 += 2 * lda;
        }

        /* odd m: one trailing line */
        if (m & 1)
        {
            LD_SP2(line0, 4, l0_v0, l0_v1);
            ST_SP2_INC(l0_v0, l0_v1, out, 4);
        }
    }

    /* ---- one strip of two elements ---- */
    if (n & 2)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 4;

        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            l0_v0 = LD_SP(line0);
            l1_v0 = LD_SP(line1);
            ST_SP2_INC(l0_v0, l1_v0, out, 4);

            line0 += 2 * lda;
            line1 += 2 * lda;
        }

        if (m & 1)
        {
            l0_v0 = LD_SP(line0);
            ST_SP(l0_v0, out);
            out += 4;
        }
    }

    /* ---- final strip of one element: scalar copy ---- */
    if (n & 1)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 2;

        pair_cnt = m >> 1;
        while (pair_cnt--)
        {
            /* read both lines before writing anything */
            f00 = line0[0];
            f01 = line0[1];
            f10 = line1[0];
            f11 = line1[1];

            out[0] = f00;
            out[1] = f01;
            out[2] = f10;
            out[3] = f11;

            line0 += 2 * lda;
            line1 += 2 * lda;
            out += 4;
        }

        if (m & 1)
        {
            f00 = line0[0];
            f01 = line0[1];

            out[0] = f00;
            out[1] = f01;
            out += 2;
        }
    }

    return 0;
}
|
|
@ -0,0 +1,214 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Packs src into dst in strips that are eight two-FLOAT elements wide.
 *
 * lda is doubled on entry to turn it into a FLOAT offset. The contiguous
 * direction of src is cut into strips of eight elements (16 FLOATs),
 * followed - as selected by n - by one strip of four, one of two and one of
 * a single element. For each strip the m lda-strided lines are visited four
 * per step, then two, then one, and the strip's slice of each line is
 * appended to dst, so dst is written strictly sequentially.
 *
 * NOTE(review): the two-FLOAT element is presumably a complex (re, im) pair.
 * LD_SP, LD_SP2, LD_SP4, ST_SP and ST_SP*_INC come from macros_msa.h, which
 * is not visible here; the LD_SP2/LD_SP4 forms are assumed to leave their
 * pointer untouched while the ST_*_INC forms are assumed to advance the
 * destination - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG step_cnt, strip_cnt;
    int pass;
    FLOAT *strip, *line0, *line1, *out;
    FLOAT f00, f01, f10, f11;
    v4f32 l0_v0, l0_v1, l0_v2, l0_v3, l1_v0, l1_v1, l1_v2, l1_v3;
    v4f32 l2_v0, l2_v1, l2_v2, l2_v3, l3_v0, l3_v1, l3_v2, l3_v3;

    strip = src;
    out = dst;
    lda *= 2;   /* two FLOATs per element */

    /* ---- strips of eight elements ---- */
    strip_cnt = n >> 3;
    while (strip_cnt--)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 16;

        /* four lines per step; lines 2 and 3 are reached via + 2 * lda */
        step_cnt = m >> 2;
        while (step_cnt--)
        {
            LD_SP4(line0, 4, l0_v0, l0_v1, l0_v2, l0_v3);
            LD_SP4(line1, 4, l1_v0, l1_v1, l1_v2, l1_v3);
            LD_SP4(line0 + 2 * lda, 4, l2_v0, l2_v1, l2_v2, l2_v3);
            LD_SP4(line1 + 2 * lda, 4, l3_v0, l3_v1, l3_v2, l3_v3);
            ST_SP8_INC(l0_v0, l0_v1, l0_v2, l0_v3, l1_v0, l1_v1, l1_v2, l1_v3, out, 4);
            ST_SP8_INC(l2_v0, l2_v1, l2_v2, l2_v3, l3_v0, l3_v1, l3_v2, l3_v3, out, 4);
            line0 += 4 * lda;
            line1 += 4 * lda;
        }

        if (m & 2)
        {
            LD_SP4(line0, 4, l0_v0, l0_v1, l0_v2, l0_v3);
            LD_SP4(line1, 4, l1_v0, l1_v1, l1_v2, l1_v3);
            ST_SP8_INC(l0_v0, l0_v1, l0_v2, l0_v3, l1_v0, l1_v1, l1_v2, l1_v3, out, 4);
            line0 += 2 * lda;
            line1 += 2 * lda;
        }

        if (m & 1)
        {
            LD_SP4(line0, 4, l0_v0, l0_v1, l0_v2, l0_v3);
            ST_SP4_INC(l0_v0, l0_v1, l0_v2, l0_v3, out, 4);
        }
    }

    /* ---- one strip of four elements ---- */
    if (n & 4)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 8;

        step_cnt = m >> 2;
        while (step_cnt--)
        {
            LD_SP2(line0, 4, l0_v0, l0_v1);
            LD_SP2(line1, 4, l1_v0, l1_v1);
            LD_SP2(line0 + 2 * lda, 4, l2_v0, l2_v1);
            LD_SP2(line1 + 2 * lda, 4, l3_v0, l3_v1);

            ST_SP4_INC(l0_v0, l0_v1, l1_v0, l1_v1, out, 4);
            ST_SP4_INC(l2_v0, l2_v1, l3_v0, l3_v1, out, 4);
            line0 += 4 * lda;
            line1 += 4 * lda;
        }

        if (m & 2)
        {
            LD_SP2(line0, 4, l0_v0, l0_v1);
            LD_SP2(line1, 4, l1_v0, l1_v1);
            ST_SP4_INC(l0_v0, l0_v1, l1_v0, l1_v1, out, 4);
            line0 += 2 * lda;
            line1 += 2 * lda;
        }

        if (m & 1)
        {
            LD_SP2(line0, 4, l0_v0, l0_v1);
            ST_SP2_INC(l0_v0, l0_v1, out, 4);
        }
    }

    /* ---- one strip of two elements ---- */
    if (n & 2)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 4;

        step_cnt = m >> 2;
        while (step_cnt--)
        {
            l0_v0 = LD_SP(line0);
            l1_v0 = LD_SP(line1);
            l2_v0 = LD_SP(line0 + 2 * lda);
            l3_v0 = LD_SP(line1 + 2 * lda);
            ST_SP4_INC(l0_v0, l1_v0, l2_v0, l3_v0, out, 4);

            line0 += 4 * lda;
            line1 += 4 * lda;
        }

        if (m & 2)
        {
            l0_v0 = LD_SP(line0);
            l1_v0 = LD_SP(line1);
            ST_SP2_INC(l0_v0, l1_v0, out, 4);

            line0 += 2 * lda;
            line1 += 2 * lda;
        }

        if (m & 1)
        {
            l0_v0 = LD_SP(line0);
            ST_SP(l0_v0, out);
            out += 4;
        }
    }

    /* ---- final strip of one element: scalar copy ---- */
    if (n & 1)
    {
        line0 = strip;
        line1 = strip + lda;
        strip += 2;

        /* four lines per step, handled as two identical two-line passes */
        step_cnt = m >> 2;
        while (step_cnt--)
        {
            for (pass = 0; pass < 2; pass++)
            {
                f00 = line0[0];
                f01 = line0[1];
                f10 = line1[0];
                f11 = line1[1];

                out[0] = f00;
                out[1] = f01;
                out[2] = f10;
                out[3] = f11;

                line0 += 2 * lda;
                line1 += 2 * lda;
                out += 4;
            }
        }

        if (m & 2)
        {
            f00 = line0[0];
            f01 = line0[1];
            f10 = line1[0];
            f11 = line1[1];

            out[0] = f00;
            out[1] = f01;
            out[2] = f10;
            out[3] = f11;

            line0 += 2 * lda;
            line1 += 2 * lda;
            out += 4;
        }

        if (m & 1)
        {
            f00 = line0[0];
            f01 = line0[1];

            out[0] = f00;
            out[1] = f01;
            out += 2;
        }
    }

    return 0;
}
|
File diff suppressed because it is too large
Load Diff
|
@ -32,8 +32,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
FLOAT * __restrict dst)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
|
||||
FLOAT *pdst;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *pdst;
|
||||
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
|
||||
|
@ -50,28 +49,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (m >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
ILVRL_D2_DP(src2, src0, dst0, dst4);
|
||||
ILVRL_D2_DP(src6, src4, dst1, dst5);
|
||||
ILVRL_D2_DP(src3, src1, dst2, dst6);
|
||||
ILVRL_D2_DP(src7, src5, dst3, dst7);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
|
||||
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
|
||||
|
||||
ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
|
||||
pdst += 16;
|
||||
ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
|
||||
}
|
||||
|
||||
for (i = (m & 3); i--;)
|
||||
|
@ -91,18 +79,13 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (m >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
|
||||
ILVRL_D2_DP(src2, src0, dst0, dst4);
|
||||
ILVRL_D2_DP(src3, src1, dst1, dst5);
|
||||
|
||||
ST_DP4(dst0, dst4, dst1, dst5, pdst, 2);
|
||||
pdst += 8;
|
||||
ST_DP4_INC(dst0, dst4, dst1, dst5, pdst, 2);
|
||||
}
|
||||
|
||||
for (i = (m & 3); i--;)
|
||||
|
|
|
@ -32,9 +32,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
FLOAT * __restrict dst)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4;
|
||||
FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
|
||||
FLOAT *pdst;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
|
||||
FLOAT *psrc8, *pdst;
|
||||
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v2f64 src8, src9, src10, src11, src12, src13, src14, src15;
|
||||
v2f64 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
|
@ -56,80 +55,51 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
LD_DP2(psrc5, 2, src8, src9);
|
||||
LD_DP2(psrc6, 2, src10, src11);
|
||||
LD_DP2(psrc7, 2, src12, src13);
|
||||
LD_DP2(psrc8, 2, src14, src15);
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
LD_DP2_INC(psrc5, 2, src8, src9);
|
||||
LD_DP2_INC(psrc6, 2, src10, src11);
|
||||
LD_DP2_INC(psrc7, 2, src12, src13);
|
||||
LD_DP2_INC(psrc8, 2, src14, src15);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
|
||||
ILVRL_D2_DP(src2, src0, dst0, dst4);
|
||||
ILVRL_D2_DP(src6, src4, dst1, dst5);
|
||||
ILVRL_D2_DP(src10, src8, dst2, dst6);
|
||||
ILVRL_D2_DP(src14, src12, dst3, dst7);
|
||||
|
||||
ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
|
||||
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
|
||||
ILVRL_D2_DP(src3, src1, dst0, dst4);
|
||||
ILVRL_D2_DP(src7, src5, dst1, dst5);
|
||||
ILVRL_D2_DP(src11, src9, dst2, dst6);
|
||||
ILVRL_D2_DP(src15, src13, dst3, dst7);
|
||||
|
||||
ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 16,
|
||||
2);
|
||||
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
|
||||
|
||||
LD_DP2(psrc1 + 4, 2, src0, src1);
|
||||
LD_DP2(psrc2 + 4, 2, src2, src3);
|
||||
LD_DP2(psrc3 + 4, 2, src4, src5);
|
||||
LD_DP2(psrc4 + 4, 2, src6, src7);
|
||||
LD_DP2(psrc5 + 4, 2, src8, src9);
|
||||
LD_DP2(psrc6 + 4, 2, src10, src11);
|
||||
LD_DP2(psrc7 + 4, 2, src12, src13);
|
||||
LD_DP2(psrc8 + 4, 2, src14, src15);
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
LD_DP2_INC(psrc5, 2, src8, src9);
|
||||
LD_DP2_INC(psrc6, 2, src10, src11);
|
||||
LD_DP2_INC(psrc7, 2, src12, src13);
|
||||
LD_DP2_INC(psrc8, 2, src14, src15);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src10, (v2i64) src8);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src14, (v2i64) src12);
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src10, (v2i64) src8);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src14, (v2i64) src12);
|
||||
ILVRL_D2_DP(src2, src0, dst0, dst4);
|
||||
ILVRL_D2_DP(src6, src4, dst1, dst5);
|
||||
ILVRL_D2_DP(src10, src8, dst2, dst6);
|
||||
ILVRL_D2_DP(src14, src12, dst3, dst7);
|
||||
|
||||
ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 32,
|
||||
2);
|
||||
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src11, (v2i64) src9);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src15, (v2i64) src13);
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src11, (v2i64) src9);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src15, (v2i64) src13);
|
||||
ILVRL_D2_DP(src3, src1, dst0, dst4);
|
||||
ILVRL_D2_DP(src7, src5, dst1, dst5);
|
||||
ILVRL_D2_DP(src11, src9, dst2, dst6);
|
||||
ILVRL_D2_DP(src15, src13, dst3, dst7);
|
||||
|
||||
ST_DP8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst + 48,
|
||||
2);
|
||||
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
psrc5 += 8;
|
||||
psrc6 += 8;
|
||||
psrc7 += 8;
|
||||
psrc8 += 8;
|
||||
pdst += 64;
|
||||
ST_DP8_INC(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, pdst, 2);
|
||||
}
|
||||
|
||||
for (i = (m & 7); i--;)
|
||||
|
@ -155,27 +125,17 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (m >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src2, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvr_d((v2i64) src6, (v2i64) src4);
|
||||
dst2 = (v2f64) __msa_ilvr_d((v2i64) src3, (v2i64) src1);
|
||||
dst3 = (v2f64) __msa_ilvr_d((v2i64) src7, (v2i64) src5);
|
||||
ILVRL_D2_DP(src2, src0, dst0, dst4);
|
||||
ILVRL_D2_DP(src6, src4, dst1, dst5);
|
||||
ILVRL_D2_DP(src3, src1, dst2, dst6);
|
||||
ILVRL_D2_DP(src7, src5, dst3, dst7);
|
||||
|
||||
dst4 = (v2f64) __msa_ilvl_d((v2i64) src2, (v2i64) src0);
|
||||
dst5 = (v2f64) __msa_ilvl_d((v2i64) src6, (v2i64) src4);
|
||||
dst6 = (v2f64) __msa_ilvl_d((v2i64) src3, (v2i64) src1);
|
||||
dst7 = (v2f64) __msa_ilvl_d((v2i64) src7, (v2i64) src5);
|
||||
|
||||
ST_DP8(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
|
||||
pdst += 16;
|
||||
ST_DP8_INC(dst0, dst1, dst4, dst5, dst2, dst3, dst6, dst7, pdst, 2);
|
||||
}
|
||||
|
||||
for (i = (m & 3); i--;)
|
||||
|
@ -200,11 +160,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc1 += 2;
|
||||
psrc2 += 2;
|
||||
|
||||
dst0 = (v2f64) __msa_ilvr_d((v2i64) src1, (v2i64) src0);
|
||||
dst1 = (v2f64) __msa_ilvl_d((v2i64) src1, (v2i64) src0);
|
||||
ILVRL_D2_DP(src1, src0, dst0, dst1);
|
||||
|
||||
ST_DP2(dst0, dst1, pdst, 2);
|
||||
pdst += 4;
|
||||
ST_DP2_INC(dst0, dst1, pdst, 2);
|
||||
}
|
||||
|
||||
if (m & 1)
|
||||
|
|
|
@ -55,14 +55,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
|
||||
pdst1 += m * 4;
|
||||
|
@ -79,8 +75,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc3 += 2;
|
||||
psrc4 += 2;
|
||||
|
||||
ST_DP4(src0, src1, src2, src3, pdst2, 2);
|
||||
pdst2 += 8;
|
||||
ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -103,10 +98,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
|
||||
ST_DP4(src0, src1, src2, src3, pdst1, 2);
|
||||
pdst1 += m * 4;
|
||||
|
@ -119,8 +112,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc1 += 2;
|
||||
psrc2 += 2;
|
||||
|
||||
ST_DP2(src0, src1, pdst2, 2);
|
||||
pdst2 += 4;
|
||||
ST_DP2_INC(src0, src1, pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -137,8 +129,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
psrc1 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
|
||||
ST_DP2(src0, src1, pdst1, 2);
|
||||
pdst1 += 4 * m;
|
||||
|
|
|
@ -62,27 +62,19 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP4(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4(psrc2, 2, src4, src5, src6, src7);
|
||||
LD_DP4(psrc3, 2, src8, src9, src10, src11);
|
||||
LD_DP4(psrc4, 2, src12, src13, src14, src15);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
|
||||
LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
|
||||
LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
|
||||
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
|
||||
pdst1 + 16, 2);
|
||||
|
||||
LD_DP4(psrc5, 2, src0, src1, src2, src3);
|
||||
LD_DP4(psrc6, 2, src4, src5, src6, src7);
|
||||
LD_DP4(psrc7, 2, src8, src9, src10, src11);
|
||||
LD_DP4(psrc8, 2, src12, src13, src14, src15);
|
||||
psrc5 += 8;
|
||||
psrc6 += 8;
|
||||
psrc7 += 8;
|
||||
psrc8 += 8;
|
||||
LD_DP4_INC(psrc5, 2, src0, src1, src2, src3);
|
||||
LD_DP4_INC(psrc6, 2, src4, src5, src6, src7);
|
||||
LD_DP4_INC(psrc7, 2, src8, src9, src10, src11);
|
||||
LD_DP4_INC(psrc8, 2, src12, src13, src14, src15);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1 + 32,
|
||||
2);
|
||||
|
@ -93,27 +85,18 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
LD_DP2(psrc5, 2, src8, src9);
|
||||
LD_DP2(psrc6, 2, src10, src11);
|
||||
LD_DP2(psrc7, 2, src12, src13);
|
||||
LD_DP2(psrc8, 2, src14, src15);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
psrc5 += 4;
|
||||
psrc6 += 4;
|
||||
psrc7 += 4;
|
||||
psrc8 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
LD_DP2_INC(psrc5, 2, src8, src9);
|
||||
LD_DP2_INC(psrc6, 2, src10, src11);
|
||||
LD_DP2_INC(psrc7, 2, src12, src13);
|
||||
LD_DP2_INC(psrc8, 2, src14, src15);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
|
||||
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
|
||||
pdst2 + 16, 2);
|
||||
pdst2 += 32;
|
||||
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
|
||||
ST_DP8_INC(src8, src9, src10, src11, src12, src13, src14, src15,
|
||||
pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -135,8 +118,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc7 += 2;
|
||||
psrc8 += 2;
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
|
||||
pdst3 += 16;
|
||||
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst3, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -165,18 +147,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP4(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4(psrc2, 2, src4, src5, src6, src7);
|
||||
LD_DP4(psrc3, 2, src8, src9, src10, src11);
|
||||
LD_DP4(psrc4, 2, src12, src13, src14, src15);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
psrc5 += 8;
|
||||
psrc6 += 8;
|
||||
psrc7 += 8;
|
||||
psrc8 += 8;
|
||||
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
|
||||
LD_DP4_INC(psrc3, 2, src8, src9, src10, src11);
|
||||
LD_DP4_INC(psrc4, 2, src12, src13, src14, src15);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
|
||||
ST_DP8(src8, src9, src10, src11, src12, src13, src14, src15,
|
||||
|
@ -186,17 +160,12 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
LD_DP2(psrc3, 2, src4, src5);
|
||||
LD_DP2(psrc4, 2, src6, src7);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
LD_DP2_INC(psrc3, 2, src4, src5);
|
||||
LD_DP2_INC(psrc4, 2, src6, src7);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
|
||||
pdst2 += 16;
|
||||
ST_DP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -210,8 +179,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc3 += 2;
|
||||
psrc4 += 2;
|
||||
|
||||
ST_DP4(src0, src1, src2, src3, pdst3, 2);
|
||||
pdst3 += 8;
|
||||
ST_DP4_INC(src0, src1, src2, src3, pdst3, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -234,10 +202,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP4(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4(psrc2, 2, src4, src5, src6, src7);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
|
||||
LD_DP4_INC(psrc2, 2, src4, src5, src6, src7);
|
||||
|
||||
ST_DP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 2);
|
||||
pdst1 += 8 * m;
|
||||
|
@ -245,13 +211,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
LD_DP2(psrc2, 2, src2, src3);
|
||||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
LD_DP2_INC(psrc2, 2, src2, src3);
|
||||
|
||||
ST_DP4(src0, src1, src2, src3, pdst2, 2);
|
||||
pdst2 += 8;
|
||||
ST_DP4_INC(src0, src1, src2, src3, pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -261,8 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc1 += 2;
|
||||
psrc2 += 2;
|
||||
|
||||
ST_DP2(src0, src1, pdst3, 2);
|
||||
pdst3 += 4;
|
||||
ST_DP2_INC(src0, src1, pdst3, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
|
@ -282,8 +244,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP4(psrc1, 2, src0, src1, src2, src3);
|
||||
psrc1 += 8;
|
||||
LD_DP4_INC(psrc1, 2, src0, src1, src2, src3);
|
||||
|
||||
ST_DP4(src0, src1, src2, src3, pdst1, 2);
|
||||
pdst1 += 8 * m;
|
||||
|
@ -291,11 +252,9 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2(psrc1, 2, src0, src1);
|
||||
psrc1 += 4;
|
||||
LD_DP2_INC(psrc1, 2, src0, src1);
|
||||
|
||||
ST_DP2(src0, src1, pdst2, 2);
|
||||
pdst2 += 4;
|
||||
ST_DP2_INC(src0, src1, pdst2, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
|
|
@ -42,10 +42,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define ST_D(RTYPE, in, pdst) *((RTYPE *)(pdst)) = (in)
|
||||
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
|
||||
|
||||
#define COPY_FLOAT_TO_VECTOR(a, b) \
|
||||
b = __msa_cast_to_vector_float(a); \
|
||||
b = (v4f32) __msa_splati_w((v4i32) b, 0);
|
||||
/* Build a v4f32 from scalar float 'a': the value is cast into a vector with
   __msa_cast_to_vector_float and word element 0 is then replicated with
   __msa_splati_w(..., 0).  Written as a GNU statement expression, so the
   macro evaluates to the resulting vector.  The temporary carries an '_m'
   suffix so that an argument which is itself named 'out' is not captured. */
#define COPY_FLOAT_TO_VECTOR(a) ( {                          \
    v4f32 out_m;                                             \
    out_m = __msa_cast_to_vector_float(a);                   \
    out_m = (v4f32) __msa_splati_w((v4i32) out_m, 0);        \
    out_m;                                                   \
} )
|
||||
|
||||
/* Build a v2f64 from scalar double 'a': the value is cast into a vector with
   __msa_cast_to_vector_double and doubleword element 0 is then replicated
   with __msa_splati_d(..., 0).  Written as a GNU statement expression, so
   the macro evaluates to the resulting vector.  The temporary carries an
   '_m' suffix so that an argument which is itself named 'out' is not
   captured. */
#define COPY_DOUBLE_TO_VECTOR(a) ( {                         \
    v2f64 out_m;                                             \
    out_m = __msa_cast_to_vector_double(a);                  \
    out_m = (v2f64) __msa_splati_d((v2i64) out_m, 0);        \
    out_m;                                                   \
} )
|
||||
|
||||
/* Description : Load N general-purpose (scalar) variables with stride,
                 post-incrementing the source pointer
   Arguments   : Inputs  - psrc, stride
                 Outputs - out0 ... out(N-1)
   Details     : Each step reads *(psrc) into the next output and then
                 advances 'psrc' by 'stride', so on exit 'psrc' has moved
                 by N * stride.  'psrc' must be a modifiable lvalue; it is
                 expanded several times, so pass an expression without
                 side effects.
                 The macros expand to a plain brace block, not
                 do { } while (0).
*/
#define LD_GP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = *(psrc);                           \
    (psrc) += (stride);                       \
    out1 = *(psrc);                           \
    (psrc) += (stride);                       \
}

#define LD_GP3_INC(psrc, stride, out0,        \
                   out1, out2)                \
{                                             \
    LD_GP2_INC(psrc, stride, out0, out1);     \
    out2 = *(psrc);                           \
    (psrc) += (stride);                       \
}

#define LD_GP4_INC(psrc, stride, out0,        \
                   out1, out2, out3)          \
{                                             \
    LD_GP2_INC(psrc, stride, out0, out1);     \
    LD_GP2_INC(psrc, stride, out2, out3);     \
}

#define LD_GP5_INC(psrc, stride, out0,        \
                   out1, out2, out3, out4)    \
{                                             \
    LD_GP2_INC(psrc, stride, out0, out1);     \
    LD_GP2_INC(psrc, stride, out2, out3);     \
    out4 = *(psrc);                           \
    (psrc) += (stride);                       \
}

#define LD_GP6_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5)                \
{                                             \
    LD_GP2_INC(psrc, stride, out0, out1);     \
    LD_GP2_INC(psrc, stride, out2, out3);     \
    LD_GP2_INC(psrc, stride, out4, out5);     \
}

#define LD_GP7_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5, out6)          \
{                                             \
    LD_GP2_INC(psrc, stride, out0, out1);     \
    LD_GP2_INC(psrc, stride, out2, out3);     \
    LD_GP2_INC(psrc, stride, out4, out5);     \
    out6 = *(psrc);                           \
    (psrc) += (stride);                       \
}

#define LD_GP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_GP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_GP4_INC(psrc, stride, out4, out5, out6, out7);  \
}
|
||||
|
||||
/* Description : Load 2 vectors of single precision floating point elements with stride
|
||||
Arguments : Inputs - psrc, stride
|
||||
|
@ -58,6 +130,82 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
out1 = LD_SP((psrc) + stride); \
|
||||
}
|
||||
|
||||
/* Load 4 single precision vectors without moving 'psrc': out0/out1 come from
   LD_SP2 at psrc, out2/out3 from LD_SP2 at psrc + 2 * stride. */
#define LD_SP4(psrc, stride, out0, out1, out2, out3)       \
{                                                          \
    LD_SP2(psrc, stride, out0, out1);                      \
    LD_SP2((psrc) + 2 * (stride), stride, out2, out3);     \
}

/* LD_SPn_INC: load n single precision vectors with LD_SP, advancing 'psrc'
   by 'stride' after every load (so 'psrc' ends up moved by n * stride).
   'psrc' must be a modifiable lvalue and is expanded several times.
   The macros expand to a plain brace block, not do { } while (0). */
#define LD_SP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
    out1 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_SP3_INC(psrc, stride, out0,        \
                   out1, out2)                \
{                                             \
    LD_SP2_INC(psrc, stride, out0, out1);     \
    out2 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_SP4_INC(psrc, stride, out0,        \
                   out1, out2, out3)          \
{                                             \
    LD_SP2_INC(psrc, stride, out0, out1);     \
    LD_SP2_INC(psrc, stride, out2, out3);     \
}

#define LD_SP5_INC(psrc, stride, out0,        \
                   out1, out2, out3, out4)    \
{                                             \
    LD_SP2_INC(psrc, stride, out0, out1);     \
    LD_SP2_INC(psrc, stride, out2, out3);     \
    out4 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_SP6_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5)                \
{                                             \
    LD_SP2_INC(psrc, stride, out0, out1);     \
    LD_SP2_INC(psrc, stride, out2, out3);     \
    LD_SP2_INC(psrc, stride, out4, out5);     \
}

#define LD_SP7_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5, out6)          \
{                                             \
    LD_SP2_INC(psrc, stride, out0, out1);     \
    LD_SP2_INC(psrc, stride, out2, out3);     \
    LD_SP2_INC(psrc, stride, out4, out5);     \
    out6 = LD_SP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_SP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_SP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_SP4_INC(psrc, stride, out4, out5, out6, out7);  \
}

#define LD_SP16_INC(psrc, stride, out0, out1, out2,      \
                    out3, out4, out5, out6, out7, out8,  \
                    out9, out10, out11, out12, out13,    \
                    out14, out15)                        \
{                                                        \
    LD_SP8_INC(psrc, stride, out0, out1, out2,           \
               out3, out4, out5, out6, out7);            \
    LD_SP8_INC(psrc, stride, out8, out9, out10,          \
               out11, out12, out13, out14, out15);       \
}
|
||||
|
||||
/* Description : Load 2 vectors of double precision floating point elements with stride
|
||||
Arguments : Inputs - psrc, stride
|
||||
Outputs - out0, out1
|
||||
|
@ -75,6 +223,139 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
LD_DP2(psrc + 2 * stride, stride, out2, out3) \
|
||||
}
|
||||
|
||||
/* LD_DPn_INC: load n double precision vectors with LD_DP, advancing 'psrc'
   by 'stride' after every load (so 'psrc' ends up moved by n * stride).
   'psrc' must be a modifiable lvalue and is expanded several times.
   The macros expand to a plain brace block, not do { } while (0). */
#define LD_DP2_INC(psrc, stride, out0, out1)  \
{                                             \
    out0 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
    out1 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP3_INC(psrc, stride, out0,        \
                   out1, out2)                \
{                                             \
    LD_DP2_INC(psrc, stride, out0, out1);     \
    out2 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP4_INC(psrc, stride, out0,        \
                   out1, out2, out3)          \
{                                             \
    LD_DP2_INC(psrc, stride, out0, out1);     \
    LD_DP2_INC(psrc, stride, out2, out3);     \
}

#define LD_DP5_INC(psrc, stride, out0,        \
                   out1, out2, out3, out4)    \
{                                             \
    LD_DP2_INC(psrc, stride, out0, out1);     \
    LD_DP2_INC(psrc, stride, out2, out3);     \
    out4 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP6_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5)                \
{                                             \
    LD_DP2_INC(psrc, stride, out0, out1);     \
    LD_DP2_INC(psrc, stride, out2, out3);     \
    LD_DP2_INC(psrc, stride, out4, out5);     \
}

#define LD_DP7_INC(psrc, stride, out0,        \
                   out1, out2, out3,          \
                   out4, out5, out6)          \
{                                             \
    LD_DP2_INC(psrc, stride, out0, out1);     \
    LD_DP2_INC(psrc, stride, out2, out3);     \
    LD_DP2_INC(psrc, stride, out4, out5);     \
    out6 = LD_DP((psrc));                     \
    (psrc) += (stride);                       \
}

#define LD_DP8_INC(psrc, stride, out0, out1, out2,     \
                   out3, out4, out5, out6, out7)       \
{                                                      \
    LD_DP4_INC(psrc, stride, out0, out1, out2, out3);  \
    LD_DP4_INC(psrc, stride, out4, out5, out6, out7);  \
}

#define LD_DP16_INC(psrc, stride, out0, out1, out2,      \
                    out3, out4, out5, out6, out7, out8,  \
                    out9, out10, out11, out12, out13,    \
                    out14, out15)                        \
{                                                        \
    LD_DP8_INC(psrc, stride, out0, out1, out2,           \
               out3, out4, out5, out6, out7);            \
    LD_DP8_INC(psrc, stride, out8, out9, out10,          \
               out11, out12, out13, out14, out15);       \
}
|
||||
|
||||
/* Description : Store N general-purpose (scalar) variables with stride,
                 post-incrementing the destination pointer
   Arguments   : Inputs - in0 ... in(N-1), pdst, stride
   Details     : Each step writes the next input to *(pdst) and then
                 advances 'pdst' by 'stride', so on exit 'pdst' has moved
                 by N * stride.  'pdst' must be a modifiable lvalue; it is
                 expanded several times, so pass an expression without
                 side effects.
                 The macros expand to a plain brace block, not
                 do { } while (0).
*/
#define ST_GP2_INC(in0, in1,                  \
                   pdst, stride)              \
{                                             \
    *(pdst) = (in0);                          \
    (pdst) += (stride);                       \
    *(pdst) = (in1);                          \
    (pdst) += (stride);                       \
}

#define ST_GP3_INC(in0, in1, in2,             \
                   pdst, stride)              \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    *(pdst) = (in2);                          \
    (pdst) += (stride);                       \
}

#define ST_GP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    ST_GP2_INC(in2, in3, pdst, stride);       \
}

#define ST_GP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    ST_GP2_INC(in2, in3, pdst, stride);       \
    *(pdst) = (in4);                          \
    (pdst) += (stride);                       \
}

#define ST_GP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    ST_GP2_INC(in2, in3, pdst, stride);       \
    ST_GP2_INC(in4, in5, pdst, stride);       \
}

#define ST_GP7_INC(in0, in1, in2, in3, in4,   \
                   in5, in6, pdst, stride)    \
{                                             \
    ST_GP2_INC(in0, in1, pdst, stride);       \
    ST_GP2_INC(in2, in3, pdst, stride);       \
    ST_GP2_INC(in4, in5, pdst, stride);       \
    *(pdst) = (in6);                          \
    (pdst) += (stride);                       \
}

#define ST_GP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_GP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_GP4_INC(in4, in5, in6, in7, pdst, stride);  \
}
|
||||
|
||||
/* Description : Store vectors of single precision floating point elements with stride
|
||||
Arguments : Inputs - in0, in1, pdst, stride
|
||||
Details : Store 4 single precision floating point elements from 'in0' to (pdst)
|
||||
|
@ -98,6 +379,73 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ST_SP4(in4, in5, in6, in7, (pdst + 4 * stride), stride); \
|
||||
}
|
||||
|
||||
/* ST_SPn_INC: store n single precision vectors with ST_SP, advancing 'pdst'
   by 'stride' after every store (so 'pdst' ends up moved by n * stride).
   'pdst' must be a modifiable lvalue and is expanded several times.
   The macros expand to a plain brace block, not do { } while (0). */
#define ST_SP2_INC(in0, in1, pdst, stride)    \
{                                             \
    ST_SP(in0, (pdst));                       \
    (pdst) += (stride);                       \
    ST_SP(in1, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_SP3_INC(in0, in1, in2,             \
                   pdst, stride)              \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP(in2, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_SP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP2_INC(in2, in3, pdst, stride);       \
}

#define ST_SP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP2_INC(in2, in3, pdst, stride);       \
    ST_SP(in4, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_SP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP2_INC(in2, in3, pdst, stride);       \
    ST_SP2_INC(in4, in5, pdst, stride);       \
}

#define ST_SP7_INC(in0, in1, in2, in3, in4,   \
                   in5, in6, pdst, stride)    \
{                                             \
    ST_SP2_INC(in0, in1, pdst, stride);       \
    ST_SP2_INC(in2, in3, pdst, stride);       \
    ST_SP2_INC(in4, in5, pdst, stride);       \
    ST_SP(in6, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_SP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_SP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_SP4_INC(in4, in5, in6, in7, pdst, stride);  \
}

#define ST_SP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
                    in7, in8, in9, in10, in11, in12,    \
                    in13, in14, in15, pdst, stride)     \
{                                                       \
    ST_SP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
               in7, pdst, stride);                      \
    ST_SP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
               in15, pdst, stride);                     \
}
|
||||
|
||||
/* Description : Store vectors of double precision floating point elements with stride
|
||||
Arguments : Inputs - in0, in1, pdst, stride
|
||||
Details : Store 2 double precision floating point elements from 'in0' to (pdst)
|
||||
|
@ -121,6 +469,104 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
ST_DP4(in4, in5, in6, in7, (pdst) + 4 * stride, stride); \
|
||||
}
|
||||
|
||||
/* ST_DPn_INC: store n double precision vectors with ST_DP, advancing 'pdst'
   by 'stride' after every store (so 'pdst' ends up moved by n * stride).
   'pdst' must be a modifiable lvalue and is expanded several times.
   The macros expand to a plain brace block, not do { } while (0). */
#define ST_DP2_INC(in0, in1, pdst, stride)    \
{                                             \
    ST_DP(in0, (pdst));                       \
    (pdst) += (stride);                       \
    ST_DP(in1, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_DP3_INC(in0, in1, in2,             \
                   pdst, stride)              \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP(in2, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_DP4_INC(in0, in1, in2, in3,        \
                   pdst, stride)              \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP2_INC(in2, in3, pdst, stride);       \
}

#define ST_DP5_INC(in0, in1, in2, in3,        \
                   in4, pdst, stride)         \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP2_INC(in2, in3, pdst, stride);       \
    ST_DP(in4, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_DP6_INC(in0, in1, in2, in3,        \
                   in4, in5, pdst, stride)    \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP2_INC(in2, in3, pdst, stride);       \
    ST_DP2_INC(in4, in5, pdst, stride);       \
}

#define ST_DP7_INC(in0, in1, in2, in3, in4,   \
                   in5, in6, pdst, stride)    \
{                                             \
    ST_DP2_INC(in0, in1, pdst, stride);       \
    ST_DP2_INC(in2, in3, pdst, stride);       \
    ST_DP2_INC(in4, in5, pdst, stride);       \
    ST_DP(in6, (pdst));                       \
    (pdst) += (stride);                       \
}

#define ST_DP8_INC(in0, in1, in2, in3, in4, in5,   \
                   in6, in7, pdst, stride)         \
{                                                  \
    ST_DP4_INC(in0, in1, in2, in3, pdst, stride);  \
    ST_DP4_INC(in4, in5, in6, in7, pdst, stride);  \
}

#define ST_DP16_INC(in0, in1, in2, in3, in4, in5, in6,  \
                    in7, in8, in9, in10, in11, in12,    \
                    in13, in14, in15, pdst, stride)     \
{                                                       \
    ST_DP8_INC(in0, in1, in2, in3, in4, in5, in6,       \
               in7, pdst, stride);                      \
    ST_DP8_INC(in8, in9, in10, in11, in12, in13, in14,  \
               in15, pdst, stride);                     \
}
|
||||
|
||||
/* Description : Shuffle word elements of each input vector as per shf_val
   Arguments   : Inputs  - in0, in1 (, in2, in3), shf_val
                 Outputs - out0, out1 (, out2, out3)
                 Return Type - as per RTYPE
   Details     : out<k> = (RTYPE) __msa_shf_w((v4i32) in<k>, shf_val) for
                 every input; the same 'shf_val' is applied to all of them.
                 NOTE(review): 'shf_val' is handed straight to the
                 intrinsic and presumably has to be a compile-time
                 constant - confirm against the MSA built-in reference.
*/
#define SHF_W2(RTYPE, in0, in1, out0, out1, shf_val)        \
{                                                           \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);     \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);     \
}
#define SHF_W2_SP(...) SHF_W2(v4f32, __VA_ARGS__)
#define SHF_W2_DP(...) SHF_W2(v2f64, __VA_ARGS__)

#define SHF_W3(RTYPE, in0, in1, in2, out0, out1, out2,      \
               shf_val)                                     \
{                                                           \
    out0 = (RTYPE) __msa_shf_w((v4i32) (in0), shf_val);     \
    out1 = (RTYPE) __msa_shf_w((v4i32) (in1), shf_val);     \
    out2 = (RTYPE) __msa_shf_w((v4i32) (in2), shf_val);     \
}
#define SHF_W3_SP(...) SHF_W3(v4f32, __VA_ARGS__)

#define SHF_W4(RTYPE, in0, in1, in2, in3,                   \
               out0, out1, out2, out3, shf_val)             \
{                                                           \
    SHF_W2(RTYPE, in0, in1, out0, out1, shf_val);           \
    SHF_W2(RTYPE, in2, in3, out2, out3, shf_val);           \
}
#define SHF_W4_SP(...) SHF_W4(v4f32, __VA_ARGS__)
#define SHF_W4_DP(...) SHF_W4(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Interleave both left and right half of input vectors
|
||||
Arguments : Inputs - in0, in1
|
||||
Outputs - out0, out1
|
||||
|
@ -134,12 +580,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
out1 = (RTYPE) __msa_ilvl_w((v4i32) in0, (v4i32) in1); \
|
||||
}
|
||||
#define ILVRL_W2_SW(...) ILVRL_W2(v4i32, __VA_ARGS__)
|
||||
#define ILVRL_W2_SP(...) ILVRL_W2(v4f32, __VA_ARGS__)
|
||||
|
||||
/* Doubleword interleave of 'in0' and 'in1':
   out0 = (RTYPE) __msa_ilvr_d(in0, in1)  (right-half interleave)
   out1 = (RTYPE) __msa_ilvl_d(in0, in1)  (left-half interleave)
   Both inputs are expanded twice, so pass expressions without side
   effects. */
#define ILVRL_D2(RTYPE, in0, in1, out0, out1)                     \
{                                                                 \
    out0 = (RTYPE) __msa_ilvr_d((v2i64) (in0), (v2i64) (in1));    \
    out1 = (RTYPE) __msa_ilvl_d((v2i64) (in0), (v2i64) (in1));    \
}
#define ILVRL_D2_SP(...) ILVRL_D2(v4f32, __VA_ARGS__)
#define ILVRL_D2_DP(...) ILVRL_D2(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Indexed word element values are replicated to all
|
||||
|
@ -158,6 +606,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
out0 = (RTYPE) __msa_splati_w((v4i32) in, stidx); \
|
||||
out1 = (RTYPE) __msa_splati_w((v4i32) in, (stidx+1)); \
|
||||
}
|
||||
#define SPLATI_W2_SP(...) SPLATI_W2(v4f32, __VA_ARGS__)
|
||||
|
||||
#define SPLATI_W4(RTYPE, in, out0, out1, out2, out3) \
|
||||
{ \
|
||||
|
@ -166,22 +615,132 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
}
|
||||
#define SPLATI_W4_SP(...) SPLATI_W4(v4f32, __VA_ARGS__)
|
||||
|
||||
/* Description : Replicate each double word element of a vector
   Arguments   : Input   - in
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : 'out0' is __msa_splati_d of element 0 of 'in' and 'out1' is
                 __msa_splati_d of element 1, each cast to RTYPE. 'in' is
                 expanded twice, so it should be free of side effects; it is
                 parenthesized so that an expression argument is cast as a
                 whole.
*/
#define SPLATI_D2(RTYPE, in, out0, out1)                 \
{                                                        \
    out0 = (RTYPE) __msa_splati_d((v2i64) (in), 0);      \
    out1 = (RTYPE) __msa_splati_d((v2i64) (in), 1);      \
}
#define SPLATI_D2_DP(...) SPLATI_D2(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Pack even double word elements of vector pairs
   Arguments   : Inputs  - in0, in1, in2, in3 (in4, in5, in6, in7)
                 Outputs - out0, out1 (out2, out3)
                 Return Type - as per RTYPE
   Details     : Even double word elements of 'in0' are copied to the left
                 half of 'out0' & even double word elements of 'in1' are
                 copied to the right half of 'out0'. The same is done for
                 every following input pair and its output ('in2', 'in3' ->
                 'out1', and so on).
                 Inputs are parenthesized so that an expression argument is
                 cast as a whole rather than only its first operand.
*/
#define PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1)           \
{                                                                 \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));   \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));   \
}
#define PCKEV_D2_SP(...) PCKEV_D2(v4f32, __VA_ARGS__)
/* NOTE(review): this wrapper is suffixed _SD while the other v2f64 wrappers
   in this header use _DP; the name is kept as is for existing callers. */
#define PCKEV_D2_SD(...) PCKEV_D2(v2f64, __VA_ARGS__)

#define PCKEV_D3(RTYPE, in0, in1, in2, in3, in4, in5,             \
                 out0, out1, out2)                                \
{                                                                 \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));   \
    out1 = (RTYPE) __msa_pckev_d((v2i64) (in2), (v2i64) (in3));   \
    out2 = (RTYPE) __msa_pckev_d((v2i64) (in4), (v2i64) (in5));   \
}
#define PCKEV_D3_SP(...) PCKEV_D3(v4f32, __VA_ARGS__)

/* Four-pair variant, built from two PCKEV_D2 expansions */
#define PCKEV_D4(RTYPE, in0, in1, in2, in3, in4, in5, in6, in7,   \
                 out0, out1, out2, out3)                          \
{                                                                 \
    PCKEV_D2(RTYPE, in0, in1, in2, in3, out0, out1);              \
    PCKEV_D2(RTYPE, in4, in5, in6, in7, out2, out3);              \
}
#define PCKEV_D4_SP(...) PCKEV_D4(v4f32, __VA_ARGS__)
|
||||
|
||||
/* Description : Pack both even and odd elements of a vector pair
   Arguments   : Inputs  - in0, in1
                 Outputs - out0, out1
                 Return Type - as per RTYPE
   Details     : Even elements of 'in0' and 'in1' are packed into 'out0' &
                 odd elements of 'in0' and 'in1' are packed into 'out1'.
                 PCKEVOD_W2 operates on word elements, PCKEVOD_D2 on double
                 word elements. Both inputs are expanded twice, so they
                 should be free of side effects; they are parenthesized so
                 that an expression argument is cast as a whole.
*/
#define PCKEVOD_W2(RTYPE, in0, in1, out0, out1)                   \
{                                                                 \
    out0 = (RTYPE) __msa_pckev_w((v4i32) (in0), (v4i32) (in1));   \
    out1 = (RTYPE) __msa_pckod_w((v4i32) (in0), (v4i32) (in1));   \
}
#define PCKEVOD_W2_SP(...) PCKEVOD_W2(v4f32, __VA_ARGS__)

#define PCKEVOD_D2(RTYPE, in0, in1, out0, out1)                   \
{                                                                 \
    out0 = (RTYPE) __msa_pckev_d((v2i64) (in0), (v2i64) (in1));   \
    out1 = (RTYPE) __msa_pckod_d((v2i64) (in0), (v2i64) (in1));   \
}
#define PCKEVOD_D2_DP(...) PCKEVOD_D2(v2f64, __VA_ARGS__)
|
||||
|
||||
/* Description : Multiplication of pairs of operands
   Arguments   : Inputs  - in0, in1, in2, in3 (in4, in5, in6, in7)
                 Outputs - out0, out1 (out2, out3)
   Details     : 'in0' is multiplied with 'in1' and the result is written to
                 'out0'; each following input pair is multiplied into the
                 next output in the same way.
                 Every input is parenthesized so that an argument containing
                 a lower-precedence operator (e.g. 'a + b') is multiplied as
                 a whole.
*/
#define MUL2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
}
#define MUL3(in0, in1, in2, in3, in4, in5,    \
             out0, out1, out2)                \
{                                             \
    out0 = (in0) * (in1);                     \
    out1 = (in2) * (in3);                     \
    out2 = (in4) * (in5);                     \
}
/* Four-pair variant, built from two MUL2 expansions */
#define MUL4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    MUL2(in0, in1, in2, in3, out0, out1);             \
    MUL2(in4, in5, in6, in7, out2, out3);             \
}
|
||||
|
||||
/* Description : Addition of pairs of operands
   Arguments   : Inputs  - in0, in1, in2, in3 (in4, in5, in6, in7)
                 Outputs - out0, out1 (out2, out3)
   Details     : 'in0' is added to 'in1' and the result is written to
                 'out0'; each following input pair is summed into the next
                 output in the same way.
                 Every input is parenthesized so that an argument containing
                 a lower-precedence operator (e.g. a shift or a bitwise AND)
                 is added as a whole.
*/
#define ADD2(in0, in1, in2, in3, out0, out1)  \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
}
#define ADD3(in0, in1, in2, in3, in4, in5,    \
             out0, out1, out2)                \
{                                             \
    out0 = (in0) + (in1);                     \
    out1 = (in2) + (in3);                     \
    out2 = (in4) + (in5);                     \
}
/* Four-pair variant, built from two ADD2 expansions */
#define ADD4(in0, in1, in2, in3, in4, in5, in6, in7,  \
             out0, out1, out2, out3)                  \
{                                                     \
    ADD2(in0, in1, in2, in3, out0, out1);             \
    ADD2(in4, in5, in6, in7, out2, out3);             \
}
|
||||
|
||||
/* Description : Transpose 4x4 block with word elements in vectors
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1, out2, out3
|
||||
Return Type - as per RTYPE
|
||||
*/
|
||||
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3, out0, out1, out2, out3) \
|
||||
{ \
|
||||
v4i32 s0_m, s1_m, s2_m, s3_m; \
|
||||
\
|
||||
ILVRL_W2_SW(in1, in0, s0_m, s1_m); \
|
||||
ILVRL_W2_SW(in3, in2, s2_m, s3_m); \
|
||||
\
|
||||
out0 = (RTYPE) __msa_ilvr_d((v2i64) s2_m, (v2i64) s0_m); \
|
||||
out1 = (RTYPE) __msa_ilvl_d((v2i64) s2_m, (v2i64) s0_m); \
|
||||
out2 = (RTYPE) __msa_ilvr_d((v2i64) s3_m, (v2i64) s1_m); \
|
||||
out3 = (RTYPE) __msa_ilvl_d((v2i64) s3_m, (v2i64) s1_m); \
|
||||
/* Transpose a 4x4 block of word elements held in in0..in3 into out0..out3
   (results cast to RTYPE). Word pairs of rows (0,1) and (2,3) are
   interleaved first; the double word halves of those intermediates are
   then interleaved to form the four output vectors. */
#define TRANSPOSE4x4_W(RTYPE, in0, in1, in2, in3,  \
                       out0, out1, out2, out3)     \
{                                                  \
    v4i32 r01_m, l01_m, r23_m, l23_m;              \
                                                   \
    ILVRL_W2_SW(in1, in0, r01_m, l01_m);           \
    ILVRL_W2_SW(in3, in2, r23_m, l23_m);           \
    ILVRL_D2(RTYPE, r23_m, r01_m, out0, out1);     \
    ILVRL_D2(RTYPE, l23_m, l01_m, out2, out3);     \
}
#define TRANSPOSE4x4_SP_SP(...) TRANSPOSE4x4_W(v4f32, __VA_ARGS__)
|
||||
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
||||
FLOAT * __restrict dst)
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
FLOAT *psrc0;
|
||||
FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
|
||||
FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
|
||||
FLOAT *pdst;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
|
||||
FLOAT *psrc8, *pdst;
|
||||
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
|
||||
v4f32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
|
||||
|
@ -58,22 +55,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (m >> 3); i--;)
|
||||
{
|
||||
LD_SP2(psrc1, 4, src0, src1);
|
||||
LD_SP2(psrc2, 4, src2, src3);
|
||||
LD_SP2(psrc3, 4, src4, src5);
|
||||
LD_SP2(psrc4, 4, src6, src7);
|
||||
LD_SP2(psrc5, 4, src8, src9);
|
||||
LD_SP2(psrc6, 4, src10, src11);
|
||||
LD_SP2(psrc7, 4, src12, src13);
|
||||
LD_SP2(psrc8, 4, src14, src15);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
psrc5 += 8;
|
||||
psrc6 += 8;
|
||||
psrc7 += 8;
|
||||
psrc8 += 8;
|
||||
LD_SP2_INC(psrc1, 4, src0, src1);
|
||||
LD_SP2_INC(psrc2, 4, src2, src3);
|
||||
LD_SP2_INC(psrc3, 4, src4, src5);
|
||||
LD_SP2_INC(psrc4, 4, src6, src7);
|
||||
LD_SP2_INC(psrc5, 4, src8, src9);
|
||||
LD_SP2_INC(psrc6, 4, src10, src11);
|
||||
LD_SP2_INC(psrc7, 4, src12, src13);
|
||||
LD_SP2_INC(psrc8, 4, src14, src15);
|
||||
|
||||
TRANSPOSE4x4_SP_SP(src0, src2, src4, src6, dst0, dst2, dst4, dst6);
|
||||
TRANSPOSE4x4_SP_SP(src8, src10, src12, src14, dst1, dst3, dst5,
|
||||
|
@ -83,15 +72,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
TRANSPOSE4x4_SP_SP(src9, src11, src13, src15, dst9, dst11, dst13,
|
||||
dst15);
|
||||
|
||||
ST_SP2(dst0, dst1, pdst, 4);
|
||||
ST_SP2(dst2, dst3, pdst + 8, 4);
|
||||
ST_SP2(dst4, dst5, pdst + 16, 4);
|
||||
ST_SP2(dst6, dst7, pdst + 24, 4);
|
||||
ST_SP2(dst8, dst9, pdst + 32, 4);
|
||||
ST_SP2(dst10, dst11, pdst + 40, 4);
|
||||
ST_SP2(dst12, dst13, pdst + 48, 4);
|
||||
ST_SP2(dst14, dst15, pdst + 56, 4);
|
||||
pdst += 64;
|
||||
ST_SP2_INC(dst0, dst1, pdst, 4);
|
||||
ST_SP2_INC(dst2, dst3, pdst, 4);
|
||||
ST_SP2_INC(dst4, dst5, pdst, 4);
|
||||
ST_SP2_INC(dst6, dst7, pdst, 4);
|
||||
ST_SP2_INC(dst8, dst9, pdst, 4);
|
||||
ST_SP2_INC(dst10, dst11, pdst, 4);
|
||||
ST_SP2_INC(dst12, dst13, pdst, 4);
|
||||
ST_SP2_INC(dst14, dst15, pdst, 4);
|
||||
}
|
||||
|
||||
for (i = (m & 7); i--;)
|
||||
|
@ -128,9 +116,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
TRANSPOSE4x4_SP_SP(src0, src1, src2, src3, dst0, dst1, dst2, dst3);
|
||||
|
||||
ST_SP2(dst0, dst1, pdst, 4);
|
||||
ST_SP2(dst2, dst3, pdst + 8, 4);
|
||||
pdst += 16;
|
||||
ST_SP2_INC(dst0, dst1, pdst, 4);
|
||||
ST_SP2_INC(dst2, dst3, pdst, 4);
|
||||
}
|
||||
|
||||
for (i = (m & 3); i--;)
|
||||
|
|
|
@ -28,14 +28,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
||||
FLOAT * __restrict dst)
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
FLOAT *psrc0;
|
||||
FLOAT *psrc1, *psrc2, *psrc3, *psrc4;
|
||||
FLOAT *psrc5, *psrc6, *psrc7, *psrc8;
|
||||
FLOAT *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
|
||||
FLOAT *psrc0, *psrc1, *psrc2, *psrc3, *psrc4, *psrc5, *psrc6, *psrc7;
|
||||
FLOAT *psrc8, *pdst0, *pdst1, *pdst2, *pdst3, *pdst4;
|
||||
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v4f32 src8, src9, src10, src11, src12, src13, src14, src15;
|
||||
|
||||
|
@ -63,22 +60,14 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_SP2(psrc1, 4, src0, src1);
|
||||
LD_SP2(psrc2, 4, src2, src3);
|
||||
LD_SP2(psrc3, 4, src4, src5);
|
||||
LD_SP2(psrc4, 4, src6, src7);
|
||||
LD_SP2(psrc5, 4, src8, src9);
|
||||
LD_SP2(psrc6, 4, src10, src11);
|
||||
LD_SP2(psrc7, 4, src12, src13);
|
||||
LD_SP2(psrc8, 4, src14, src15);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
psrc5 += 8;
|
||||
psrc6 += 8;
|
||||
psrc7 += 8;
|
||||
psrc8 += 8;
|
||||
LD_SP2_INC(psrc1, 4, src0, src1);
|
||||
LD_SP2_INC(psrc2, 4, src2, src3);
|
||||
LD_SP2_INC(psrc3, 4, src4, src5);
|
||||
LD_SP2_INC(psrc4, 4, src6, src7);
|
||||
LD_SP2_INC(psrc5, 4, src8, src9);
|
||||
LD_SP2_INC(psrc6, 4, src10, src11);
|
||||
LD_SP2_INC(psrc7, 4, src12, src13);
|
||||
LD_SP2_INC(psrc8, 4, src14, src15);
|
||||
|
||||
ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
|
||||
ST_SP8(src8, src9, src10, src11, src12, src13, src14, src15,
|
||||
|
@ -105,8 +94,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc7 += 4;
|
||||
psrc8 += 4;
|
||||
|
||||
ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
|
||||
pdst2 += 32;
|
||||
ST_SP8_INC(src0, src1, src2, src3, src4, src5, src6, src7, pdst2, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -155,14 +143,10 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_SP2(psrc1, 4, src0, src1);
|
||||
LD_SP2(psrc2, 4, src2, src3);
|
||||
LD_SP2(psrc3, 4, src4, src5);
|
||||
LD_SP2(psrc4, 4, src6, src7);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
psrc3 += 8;
|
||||
psrc4 += 8;
|
||||
LD_SP2_INC(psrc1, 4, src0, src1);
|
||||
LD_SP2_INC(psrc2, 4, src2, src3);
|
||||
LD_SP2_INC(psrc3, 4, src4, src5);
|
||||
LD_SP2_INC(psrc4, 4, src6, src7);
|
||||
|
||||
ST_SP8(src0, src1, src2, src3, src4, src5, src6, src7, pdst1, 4);
|
||||
pdst1 += 8 * m;
|
||||
|
@ -179,8 +163,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc3 += 4;
|
||||
psrc4 += 4;
|
||||
|
||||
ST_SP4(src0, src1, src2, src3, pdst2, 4);
|
||||
pdst2 += 16;
|
||||
ST_SP4_INC(src0, src1, src2, src3, pdst2, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -215,10 +198,8 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_SP2(psrc1, 4, src0, src1);
|
||||
LD_SP2(psrc2, 4, src2, src3);
|
||||
psrc1 += 8;
|
||||
psrc2 += 8;
|
||||
LD_SP2_INC(psrc1, 4, src0, src1);
|
||||
LD_SP2_INC(psrc2, 4, src2, src3);
|
||||
|
||||
ST_SP4(src0, src1, src2, src3, pdst1, 4);
|
||||
pdst1 += 8 * m;
|
||||
|
@ -231,8 +212,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
psrc1 += 4;
|
||||
psrc2 += 4;
|
||||
|
||||
ST_SP2(src0, src1, pdst2, 4);
|
||||
pdst2 += 8;
|
||||
ST_SP2_INC(src0, src1, pdst2, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
|
@ -260,8 +240,7 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_SP2(psrc1, 4, src0, src1);
|
||||
psrc1 += 8;
|
||||
LD_SP2_INC(psrc1, 4, src0, src1);
|
||||
|
||||
ST_SP2(src0, src1, pdst1, 4);
|
||||
pdst1 += 8 * m;
|
||||
|
@ -288,5 +267,5 @@ int CNAME(BLASLONG m, BLASLONG n, FLOAT * __restrict src, BLASLONG lda,
|
|||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -166,7 +166,7 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_a = LD_SP(a + 32);
|
||||
SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
|
||||
src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
|
||||
|
||||
res_c4 *= src_a36;
|
||||
res_c12 *= src_a36;
|
||||
|
@ -220,9 +220,9 @@ static void ssolve_8x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
res_c0 -= res_c2 * src_a16;
|
||||
res_c8 -= res_c10 * src_a16;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
|
||||
src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
|
||||
src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
|
||||
src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
|
||||
|
||||
res_c1 *= src_a9;
|
||||
res_c9 *= src_a9;
|
||||
|
@ -306,7 +306,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
|
@ -374,7 +374,7 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_a = LD_SP(a + 32);
|
||||
SPLATI_W4_SP(src_a, src_a32, src_a33, src_a34, src_a35);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 36), src_a36);
|
||||
src_a36 = COPY_FLOAT_TO_VECTOR(*(a + 36));
|
||||
|
||||
res_c4 *= src_a36;
|
||||
res_c3 -= res_c4 * src_a35;
|
||||
|
@ -399,9 +399,9 @@ static void ssolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
res_c1 -= res_c2 * src_a17;
|
||||
res_c0 -= res_c2 * src_a16;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 9), src_a9);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 8), src_a8);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
|
||||
src_a9 = COPY_FLOAT_TO_VECTOR(*(a + 9));
|
||||
src_a8 = COPY_FLOAT_TO_VECTOR(*(a + 8));
|
||||
src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
|
||||
|
||||
res_c1 *= src_a9;
|
||||
res_c0 -= res_c1 * src_a8;
|
||||
|
@ -826,9 +826,9 @@ static void ssolve_4x8_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
|
||||
src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
|
||||
src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
|
||||
src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
|
||||
src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
|
||||
|
||||
res_c3 *= src_a15;
|
||||
res_c7 *= src_a15;
|
||||
|
@ -916,7 +916,7 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
src_a0 = LD_SP(aa);
|
||||
|
||||
|
@ -940,9 +940,9 @@ static void ssolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_a10 = (v4f32) __msa_splati_w((v4i32) src_a8, 2);
|
||||
src_a9 = (v4f32) __msa_splati_w((v4i32) src_a8, 1);
|
||||
src_a8 = (v4f32) __msa_splati_w((v4i32) src_a8, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 5), src_a5);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 4), src_a4);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 0), src_a0);
|
||||
src_a5 = COPY_FLOAT_TO_VECTOR(*(a + 5));
|
||||
src_a4 = COPY_FLOAT_TO_VECTOR(*(a + 4));
|
||||
src_a0 = COPY_FLOAT_TO_VECTOR(*(a + 0));
|
||||
|
||||
res_c3 *= src_a15;
|
||||
res_c2 -= res_c3 * src_a14;
|
||||
|
|
|
@ -162,7 +162,7 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_a = LD_SP(a + 27);
|
||||
SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
|
||||
src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
|
||||
|
||||
res_c3 *= src_a27;
|
||||
res_c11 *= src_a27;
|
||||
|
@ -216,9 +216,9 @@ static void ssolve_8x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
res_c7 -= res_c5 * src_a47;
|
||||
res_c15 -= res_c13 * src_a47;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
|
||||
src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
|
||||
src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
|
||||
src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
|
||||
|
||||
res_c6 *= src_a54;
|
||||
res_c14 *= src_a54;
|
||||
|
@ -334,7 +334,7 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_a = LD_SP(a + 27);
|
||||
SPLATI_W4_SP(src_a, src_a27, src_a28, src_a29, src_a30);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 31), src_a31);
|
||||
src_a31 = COPY_FLOAT_TO_VECTOR(*(a + 31));
|
||||
|
||||
res_c3 *= src_a27;
|
||||
res_c4 -= res_c3 * src_a28;
|
||||
|
@ -359,9 +359,9 @@ static void ssolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
res_c6 -= res_c5 * src_a46;
|
||||
res_c7 -= res_c5 * src_a47;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 54), src_a54);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 55), src_a55);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 63), src_a63);
|
||||
src_a54 = COPY_FLOAT_TO_VECTOR(*(a + 54));
|
||||
src_a55 = COPY_FLOAT_TO_VECTOR(*(a + 55));
|
||||
src_a63 = COPY_FLOAT_TO_VECTOR(*(a + 63));
|
||||
|
||||
res_c6 *= src_a54;
|
||||
res_c7 -= res_c6 * src_a55;
|
||||
|
@ -780,7 +780,7 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 8;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
src_a0 = LD_SP(a);
|
||||
|
||||
|
@ -813,9 +813,9 @@ static void ssolve_4x8_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
|
||||
src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
|
||||
src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
|
||||
src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
|
||||
src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
|
||||
src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
|
||||
|
||||
res_c0 *= src_a0;
|
||||
res_c4 *= src_a0;
|
||||
|
@ -902,7 +902,7 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
src_a0 = LD_SP(a);
|
||||
|
||||
|
@ -926,9 +926,9 @@ static void ssolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_a7 = (v4f32) __msa_splati_w((v4i32) src_a5, 2);
|
||||
src_a6 = (v4f32) __msa_splati_w((v4i32) src_a5, 1);
|
||||
src_a5 = (v4f32) __msa_splati_w((v4i32) src_a5, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 10), src_a10);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 11), src_a11);
|
||||
COPY_FLOAT_TO_VECTOR(*(a + 15), src_a15);
|
||||
src_a10 = COPY_FLOAT_TO_VECTOR(*(a + 10));
|
||||
src_a11 = COPY_FLOAT_TO_VECTOR(*(a + 11));
|
||||
src_a15 = COPY_FLOAT_TO_VECTOR(*(a + 15));
|
||||
|
||||
res_c0 *= src_a0;
|
||||
res_c1 -= res_c0 * src_a1;
|
||||
|
|
|
@ -144,7 +144,7 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_b = LD_SP(b + 27);
|
||||
SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
|
||||
src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
|
||||
|
||||
src_c4 *= src_b18;
|
||||
src_c5 *= src_b18;
|
||||
|
@ -184,9 +184,9 @@ static void ssolve_8x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
|
||||
src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
|
||||
src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
|
||||
src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
|
||||
src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
|
||||
|
||||
src_c8 *= src_b36;
|
||||
src_c9 *= src_b36;
|
||||
|
@ -275,7 +275,7 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
|
@ -300,9 +300,9 @@ static void ssolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
|
||||
src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
|
||||
src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
|
||||
src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
|
||||
src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
|
||||
src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
|
@ -351,8 +351,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -364,8 +364,8 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -376,12 +376,12 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 2;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -392,9 +392,9 @@ static void ssolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 2;
|
||||
}
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
|
||||
src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
|
@ -419,7 +419,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -429,7 +429,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -439,7 +439,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -449,7 +449,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -458,13 +458,13 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 1;
|
||||
}
|
||||
|
||||
if (bk & 3)
|
||||
if ((bk & 3) && (bk > 0))
|
||||
{
|
||||
if (bk & 2)
|
||||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -474,7 +474,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -487,7 +487,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
{
|
||||
LD_SP2(a, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -497,7 +497,7 @@ static void ssolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
}
|
||||
}
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
|
@ -574,7 +574,7 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_b = LD_SP(b + 27);
|
||||
SPLATI_W4_SP(src_b, src_b27, src_b28, src_b29, src_b30);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 31), src_b31);
|
||||
src_b31 = COPY_FLOAT_TO_VECTOR(*(b + 31));
|
||||
|
||||
src_b = LD_SP(b + 36);
|
||||
SPLATI_W4_SP(src_b, src_b36, src_b37, src_b38, src_b39);
|
||||
|
@ -584,9 +584,9 @@ static void ssolve_4x8_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b46 = (v4f32) __msa_splati_w((v4i32) src_b45, 1);
|
||||
src_b45 = (v4f32) __msa_splati_w((v4i32) src_b45, 0);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 54), src_b54);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 55), src_b55);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 63), src_b63);
|
||||
src_b54 = COPY_FLOAT_TO_VECTOR(*(b + 54));
|
||||
src_b55 = COPY_FLOAT_TO_VECTOR(*(b + 55));
|
||||
src_b63 = COPY_FLOAT_TO_VECTOR(*(b + 63));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 -= src_c0 * src_b1;
|
||||
|
@ -686,7 +686,7 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
src_a0 = LD_SP(a);
|
||||
|
||||
|
@ -707,9 +707,9 @@ static void ssolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b7 = (v4f32) __msa_splati_w((v4i32) src_b5, 2);
|
||||
src_b6 = (v4f32) __msa_splati_w((v4i32) src_b5, 1);
|
||||
src_b5 = (v4f32) __msa_splati_w((v4i32) src_b5, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 10), src_b10);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 11), src_b11);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 15), src_b15);
|
||||
src_b10 = COPY_FLOAT_TO_VECTOR(*(b + 10));
|
||||
src_b11 = COPY_FLOAT_TO_VECTOR(*(b + 11));
|
||||
src_b15 = COPY_FLOAT_TO_VECTOR(*(b + 15));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 -= src_c0 * src_b1;
|
||||
|
@ -789,7 +789,7 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
b += 2;
|
||||
}
|
||||
|
||||
if (bk & 3)
|
||||
if ((bk & 3) && (bk > 0))
|
||||
{
|
||||
if (bk & 2)
|
||||
{
|
||||
|
@ -831,9 +831,9 @@ static void ssolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
}
|
||||
}
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 1), src_b1);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(b + 1));
|
||||
src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 -= src_c0 * src_b1;
|
||||
|
|
|
@ -158,7 +158,7 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_b = LD_SP(b + 32);
|
||||
SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
|
||||
src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
|
||||
|
||||
src_c8 *= src_b36;
|
||||
src_c9 *= src_b36;
|
||||
|
@ -203,9 +203,9 @@ static void ssolve_8x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
ST_SP2(src_c4, src_c5, c_nxt2line, 4);
|
||||
ST_SP2(src_c6, src_c7, c_nxt3line, 4);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
|
||||
src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c2 *= src_b9;
|
||||
src_c3 *= src_b9;
|
||||
|
@ -273,7 +273,7 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
|
@ -298,9 +298,9 @@ static void ssolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
|
||||
src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
|
||||
src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
|
||||
src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c7 *= src_b15;
|
||||
src_c6 *= src_b15;
|
||||
|
@ -350,8 +350,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -363,8 +363,8 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -375,12 +375,12 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 2;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 1), src_b1);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
src_b1 = COPY_FLOAT_TO_VECTOR(*(bb + 1));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -391,9 +391,9 @@ static void ssolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
a -= 16;
|
||||
b -= 4;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
|
||||
src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
|
||||
|
||||
src_c2 *= src_b3;
|
||||
src_c3 *= src_b3;
|
||||
|
@ -419,7 +419,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -429,7 +429,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -439,7 +439,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -449,7 +449,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -458,13 +458,13 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
bb += 1;
|
||||
}
|
||||
|
||||
if (bk & 3)
|
||||
if ((bk & 3) && (bk > 0))
|
||||
{
|
||||
if (bk & 2)
|
||||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -474,7 +474,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -487,7 +487,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
{
|
||||
LD_SP2(aa, 4, src_a0, src_a1);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(bb + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(bb + 0));
|
||||
|
||||
src_c0 -= src_a0 * src_b0;
|
||||
src_c1 -= src_a1 * src_b0;
|
||||
|
@ -497,7 +497,7 @@ static void ssolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
|
|||
a -= 8;
|
||||
b -= 1;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c0 *= src_b0;
|
||||
src_c1 *= src_b0;
|
||||
|
@ -579,7 +579,7 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
|
||||
src_b = LD_SP(b + 32);
|
||||
SPLATI_W4_SP(src_b, src_b32, src_b33, src_b34, src_b35);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 36), src_b36);
|
||||
src_b36 = COPY_FLOAT_TO_VECTOR(*(b + 36));
|
||||
|
||||
src_b = LD_SP(b + 24);
|
||||
SPLATI_W4_SP(src_b, src_b24, src_b25, src_b26, src_b27);
|
||||
|
@ -589,9 +589,9 @@ static void ssolve_4x8_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b17 = (v4f32) __msa_splati_w((v4i32) src_b16, 1);
|
||||
src_b16 = (v4f32) __msa_splati_w((v4i32) src_b16, 0);
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 9), src_b9);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 8), src_b8);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b9 = COPY_FLOAT_TO_VECTOR(*(b + 9));
|
||||
src_b8 = COPY_FLOAT_TO_VECTOR(*(b + 8));
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c7 *= src_b63;
|
||||
src_c6 -= src_c7 * src_b62;
|
||||
|
@ -695,7 +695,7 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 4;
|
||||
}
|
||||
|
||||
if (bk & 1)
|
||||
if ((bk & 1) && (bk > 0))
|
||||
{
|
||||
src_a = LD_SP(aa);
|
||||
|
||||
|
@ -717,9 +717,9 @@ static void ssolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
src_b10 = (v4f32) __msa_splati_w((v4i32) src_b8, 2);
|
||||
src_b9 = (v4f32) __msa_splati_w((v4i32) src_b8, 1);
|
||||
src_b8 = (v4f32) __msa_splati_w((v4i32) src_b8, 0);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 5), src_b5);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 4), src_b4);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b5 = COPY_FLOAT_TO_VECTOR(*(b + 5));
|
||||
src_b4 = COPY_FLOAT_TO_VECTOR(*(b + 4));
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c3 *= src_b15;
|
||||
src_c2 -= src_c3 * src_b14;
|
||||
|
@ -800,7 +800,7 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
bb += 2;
|
||||
}
|
||||
|
||||
if (bk & 3)
|
||||
if ((bk & 3) && (bk > 0))
|
||||
{
|
||||
if (bk & 2)
|
||||
{
|
||||
|
@ -842,9 +842,9 @@ static void ssolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
|
|||
a -= 8;
|
||||
b -= 4;
|
||||
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 3), src_b3);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 2), src_b2);
|
||||
COPY_FLOAT_TO_VECTOR(*(b + 0), src_b0);
|
||||
src_b3 = COPY_FLOAT_TO_VECTOR(*(b + 3));
|
||||
src_b2 = COPY_FLOAT_TO_VECTOR(*(b + 2));
|
||||
src_b0 = COPY_FLOAT_TO_VECTOR(*(b + 0));
|
||||
|
||||
src_c1 *= src_b3;
|
||||
src_c0 -= src_c1 * src_b2;
|
||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,144 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Pack routine that interleaves up to four source "lines" into dst.
 *
 * Lines are the runs of FLOATs that start 2 * lda apart in src; each line
 * contributes m vector-sized elements (one v2f64, i.e. two FLOATs, per
 * element).  n lines are consumed in groups of 4, then 2, then 1.  Within a
 * group the output holds element k of every line in the group, followed by
 * element k + 1 of every line, and so on; dst is written strictly
 * sequentially.
 *
 * NOTE(review): the LD_DP*_INC / ST_DP*_INC macros come from macros_msa.h,
 * which is not visible here; they are assumed to load/store consecutive
 * vectors at the given FLOAT stride and advance the pointer they are given.
 * Presumably each v2f64 carries one double-complex value (re, im) - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG groups, quads;
    FLOAT *next_line, *line0, *line1, *line2, *line3, *out;
    v2f64 p0, p1, p2, p3;   /* elements taken from line0 */
    v2f64 q0, q1, q2, q3;   /* elements taken from line1 */
    v2f64 r0, r1, r2, r3;   /* elements taken from line2 */
    v2f64 s0, s1, s2, s3;   /* elements taken from line3 */

    next_line = src;
    out = dst;
    lda *= 2;   /* distance between lines, counted in FLOATs */

    /* Groups of four lines. */
    for (groups = (n >> 2); groups != 0; --groups)
    {
        line0 = next_line;
        line1 = line0 + lda;
        line2 = line1 + lda;
        line3 = line2 + lda;
        next_line += 4 * lda;

        /* Four elements from each of the four lines per pass. */
        for (quads = (m >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, p0, p1, p2, p3);
            LD_DP4_INC(line1, 2, q0, q1, q2, q3);
            LD_DP4_INC(line2, 2, r0, r1, r2, r3);
            LD_DP4_INC(line3, 2, s0, s1, s2, s3);

            ST_DP8_INC(p0, q0, r0, s0, p1, q1, r1, s1, out, 2);
            ST_DP8_INC(p2, q2, r2, s2, p3, q3, r3, s3, out, 2);
        }

        /* Two leftover elements per line. */
        if (m & 2)
        {
            LD_DP2_INC(line0, 2, p0, p1);
            LD_DP2_INC(line1, 2, q0, q1);
            LD_DP2_INC(line2, 2, r0, r1);
            LD_DP2_INC(line3, 2, s0, s1);

            ST_DP8_INC(p0, q0, r0, s0, p1, q1, r1, s1, out, 2);
        }

        /* One leftover element per line. */
        if (m & 1)
        {
            p0 = LD_DP(line0);
            line0 += 2;
            q0 = LD_DP(line1);
            line1 += 2;
            r0 = LD_DP(line2);
            line2 += 2;
            s0 = LD_DP(line3);
            line3 += 2;

            ST_DP4_INC(p0, q0, r0, s0, out, 2);
        }
    }

    /* A remaining pair of lines. */
    if (n & 2)
    {
        line0 = next_line;
        line1 = line0 + lda;
        next_line += 2 * lda;

        for (quads = (m >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, p0, p1, p2, p3);
            LD_DP4_INC(line1, 2, q0, q1, q2, q3);

            ST_DP8_INC(p0, q0, p1, q1, p2, q2, p3, q3, out, 2);
        }

        if (m & 2)
        {
            LD_DP2_INC(line0, 2, p0, p1);
            LD_DP2_INC(line1, 2, q0, q1);

            ST_DP4_INC(p0, q0, p1, q1, out, 2);
        }

        if (m & 1)
        {
            p0 = LD_DP(line0);
            line0 += 2;
            q0 = LD_DP(line1);
            line1 += 2;

            ST_DP2_INC(p0, q0, out, 2);
        }
    }

    /* A final single line: copied through in order. */
    if (n & 1)
    {
        line0 = next_line;

        for (quads = (m >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, p0, p1, p2, p3);
            ST_DP4_INC(p0, p1, p2, p3, out, 2);
        }

        if (m & 2)
        {
            LD_DP2_INC(line0, 2, p0, p1);
            ST_DP2_INC(p0, p1, out, 2);
        }

        if (m & 1)
        {
            p0 = LD_DP(line0);
            ST_DP(p0, out);
        }
    }

    return 0;
}
|
|
@ -0,0 +1,161 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/*
 * Pack routine that copies source "lines" into dst as 4-element-wide tiles.
 *
 * Lines are the runs of FLOATs that start 2 * lda apart in src; m lines are
 * consumed in groups of 4, then 2, then 1, and each line contributes n
 * vector-sized elements (one v2f64, i.e. two FLOATs, per element).
 *
 * dst is filled through three independent cursors:
 *   - full tiles (4 elements per line) start at dst; each group of lines
 *     begins 32 / 16 FLOATs after the previous one (4-line / 2-line group)
 *     and successive tiles of the same group are m * 8 FLOATs apart;
 *   - the (n & 2) remainder is appended from dst + 2 * m * (n & ~3);
 *   - the (n & 1) remainder is appended from dst + 2 * m * (n & ~1).
 *
 * NOTE(review): the LD_DP* / ST_DP* macros come from macros_msa.h, which is
 * not visible here; the *_INC variants are assumed to advance the pointer
 * they are given while the plain ST_DP4 / ST_DP8 leave it unchanged.
 * Presumably each v2f64 carries one double-complex value (re, im) - confirm.
 *
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, FLOAT *src, BLASLONG lda, FLOAT *dst)
{
    BLASLONG groups, quads;
    FLOAT *next_line, *line0, *line1, *line2, *line3;
    FLOAT *tile_base, *tile, *pair_out, *single_out;
    v2f64 a0, a1, a2, a3;   /* elements taken from line0 */
    v2f64 b0, b1, b2, b3;   /* elements taken from line1 */
    v2f64 c0, c1, c2, c3;   /* elements taken from line2 */
    v2f64 d0, d1, d2, d3;   /* elements taken from line3 */

    next_line = src;
    tile_base = dst;
    lda *= 2;   /* distance between lines, counted in FLOATs */

    /* Start of the two remainder regions that follow the full tiles. */
    pair_out = dst + 2 * m * (n & ~3);
    single_out = dst + 2 * m * (n & ~1);

    /* Groups of four lines. */
    for (groups = (m >> 2); groups != 0; --groups)
    {
        line0 = next_line;
        line1 = line0 + lda;
        line2 = line1 + lda;
        line3 = line2 + lda;
        next_line += 4 * lda;

        tile = tile_base;
        tile_base += 32;

        /* Full tiles: four elements from each of the four lines. */
        for (quads = (n >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, a0, a1, a2, a3);
            LD_DP4_INC(line1, 2, b0, b1, b2, b3);
            LD_DP4_INC(line2, 2, c0, c1, c2, c3);
            LD_DP4_INC(line3, 2, d0, d1, d2, d3);

            ST_DP8(a0, a1, a2, a3, b0, b1, b2, b3, tile, 2);
            ST_DP8(c0, c1, c2, c3, d0, d1, d2, d3, tile + 16, 2);
            tile += m * 8;
        }

        /* Two leftover elements per line go to the pair region. */
        if (n & 2)
        {
            LD_DP2_INC(line0, 2, a0, a1);
            LD_DP2_INC(line1, 2, b0, b1);
            LD_DP2_INC(line2, 2, c0, c1);
            LD_DP2_INC(line3, 2, d0, d1);

            ST_DP8_INC(a0, a1, b0, b1, c0, c1, d0, d1, pair_out, 2);
        }

        /* One leftover element per line goes to the single region. */
        if (n & 1)
        {
            a0 = LD_DP(line0);
            line0 += 2;
            b0 = LD_DP(line1);
            line1 += 2;
            c0 = LD_DP(line2);
            line2 += 2;
            d0 = LD_DP(line3);
            line3 += 2;

            ST_DP4_INC(a0, b0, c0, d0, single_out, 2);
        }
    }

    /* A remaining pair of lines. */
    if (m & 2)
    {
        line0 = next_line;
        line1 = line0 + lda;
        next_line += 2 * lda;

        tile = tile_base;
        tile_base += 16;

        for (quads = (n >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, a0, a1, a2, a3);
            LD_DP4_INC(line1, 2, b0, b1, b2, b3);

            ST_DP8(a0, a1, a2, a3, b0, b1, b2, b3, tile, 2);

            tile += m * 8;
        }

        if (n & 2)
        {
            LD_DP2_INC(line0, 2, a0, a1);
            LD_DP2_INC(line1, 2, b0, b1);

            ST_DP4_INC(a0, a1, b0, b1, pair_out, 2);
        }

        if (n & 1)
        {
            a0 = LD_DP(line0);
            b0 = LD_DP(line1);

            ST_DP2_INC(a0, b0, single_out, 2);

            line0 += 2;
            line1 += 2;
        }
    }

    /* A final single line. */
    if (m & 1)
    {
        line0 = next_line;
        tile = tile_base;

        for (quads = (n >> 2); quads != 0; --quads)
        {
            LD_DP4_INC(line0, 2, a0, a1, a2, a3);
            ST_DP4(a0, a1, a2, a3, tile, 2);

            tile += m * 8;
        }

        if (n & 2)
        {
            LD_DP2_INC(line0, 2, a0, a1);
            ST_DP2_INC(a0, a1, pair_out, 2);
        }

        if (n & 1)
        {
            a0 = LD_DP(line0);
            ST_DP(a0, single_out);
        }
    }

    return 0;
}
|
18
param.h
18
param.h
|
@ -2188,11 +2188,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 128
|
||||
|
@ -2227,11 +2227,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define DGEMM_DEFAULT_UNROLL_M 8
|
||||
#define DGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 2
|
||||
#define CGEMM_DEFAULT_UNROLL_N 2
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 2
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
#define SGEMM_DEFAULT_P 128
|
||||
#define DGEMM_DEFAULT_P 128
|
||||
|
|
Loading…
Reference in New Issue