Add MSA optimizations for AXPY, COPY, SCAL, SWAP

Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
This commit is contained in:
kaustubh 2017-01-09 18:22:09 +05:30
parent 2ffbbb54f6
commit 88afb3bc94
18 changed files with 5835 additions and 0 deletions

View File

@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c
ZASUMKERNEL = ../mips/asum.c
endif
# AXPY kernels (S/D/C/Z): pick the *_msa.c sources when HAVE_MSA is defined,
# otherwise use the non-MSA axpy.c / zaxpy.c sources.
ifdef HAVE_MSA
SAXPYKERNEL = ../mips/saxpy_msa.c
DAXPYKERNEL = ../mips/daxpy_msa.c
CAXPYKERNEL = ../mips/caxpy_msa.c
ZAXPYKERNEL = ../mips/zaxpy_msa.c
else
SAXPYKERNEL = ../mips/axpy.c
DAXPYKERNEL = ../mips/axpy.c
CAXPYKERNEL = ../mips/zaxpy.c
ZAXPYKERNEL = ../mips/zaxpy.c
endif
# COPY kernels (S/D/C/Z): pick the *_msa.c sources when HAVE_MSA is defined,
# otherwise use the non-MSA copy.c / zcopy.c sources.
ifdef HAVE_MSA
SCOPYKERNEL = ../mips/scopy_msa.c
DCOPYKERNEL = ../mips/dcopy_msa.c
CCOPYKERNEL = ../mips/ccopy_msa.c
ZCOPYKERNEL = ../mips/zcopy_msa.c
else
SCOPYKERNEL = ../mips/copy.c
DCOPYKERNEL = ../mips/copy.c
CCOPYKERNEL = ../mips/zcopy.c
ZCOPYKERNEL = ../mips/zcopy.c
endif
ifdef HAVE_MSA
SDOTKERNEL = ../mips/sdot_msa.c
@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c
CROTKERNEL = ../mips/zrot.c
ZROTKERNEL = ../mips/zrot.c
# SCAL kernels (S/D/C/Z): pick the *_msa.c sources when HAVE_MSA is defined,
# otherwise use the non-MSA scal.c / zscal.c sources.
ifdef HAVE_MSA
SSCALKERNEL = ../mips/sscal_msa.c
DSCALKERNEL = ../mips/dscal_msa.c
CSCALKERNEL = ../mips/cscal_msa.c
ZSCALKERNEL = ../mips/zscal_msa.c
else
SSCALKERNEL = ../mips/scal.c
DSCALKERNEL = ../mips/scal.c
CSCALKERNEL = ../mips/zscal.c
ZSCALKERNEL = ../mips/zscal.c
endif
# SWAP kernels (S/D/C/Z): pick the *_msa.c sources when HAVE_MSA is defined,
# otherwise use the non-MSA swap.c / zswap.c sources.
ifdef HAVE_MSA
SSWAPKERNEL = ../mips/sswap_msa.c
DSWAPKERNEL = ../mips/dswap_msa.c
CSWAPKERNEL = ../mips/cswap_msa.c
ZSWAPKERNEL = ../mips/zswap_msa.c
else
SSWAPKERNEL = ../mips/swap.c
DSWAPKERNEL = ../mips/swap.c
CSWAPKERNEL = ../mips/zswap.c
ZSWAPKERNEL = ../mips/zswap.c
endif
ifdef HAVE_MSA
SGEMVNKERNEL = ../mips/sgemv_n_msa.c

471
kernel/mips/caxpy_msa.c Normal file
View File

@ -0,0 +1,471 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Update operators for the cross terms of the complex AXPY, with
 * alpha = da_r + i*da_i and x = x_r + i*x_i:
 *
 *   without CONJ:  y += alpha * x
 *   with CONJ:     y += alpha * conj(x)
 *
 * The da_r * x_r term is always added to y_r by the kernel. The remaining
 * terms are applied as
 *
 *   y_i OP0 da_r * x_i
 *   y_r OP1 da_i * x_i
 *   y_i OP2 da_i * x_r
 *
 * Conjugating x only negates x_i, so only the two terms that contain x_i
 * (OP0 and OP1) change sign under CONJ; the da_i * x_r term (OP2) is added
 * in both variants.
 */
#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 +=
#endif
/*
 * Complex AXPY kernel built on MSA v4f32 vectors.
 *
 * Updates y in place with y + (da_r + i*da_i) * x over n complex elements
 * stored as interleaved (real, imaginary) FLOAT pairs. The signs applied to
 * the cross terms come from the OP0/OP1/OP2 macros selected by CONJ.
 *
 * Parameters:
 *   n               number of complex elements; n < 0 returns immediately
 *   dummy0, dummy1  not used by this function
 *   da_r, da_i      real and imaginary parts of the scalar; when both are
 *                   zero the function returns without touching y
 *   x, inc_x        input vector and its stride, in complex elements
 *   y, inc_y        in/out vector and its stride, in complex elements
 *   dummy, dummy2   not used by this function
 *
 * Returns 0 on every path.
 *
 * Three paths are selected by the strides:
 *   1. inc_x == 1 and inc_y == 1: fully vectorized, 16 complex per iteration.
 *   2. inc_y == 1 only: x is gathered element by element into vectors,
 *      y is processed contiguously.
 *   3. otherwise: scalar code, 4 complex per iteration.
 *
 * NOTE(review): the LD_*, ST_*, PCKEV*, ILVRL*, FMADD*, PREF_OFFSET and
 * COPY_FLOAT_TO_VECTOR helpers come from macros_msa.h, which is not visible
 * here; comments about them below describe how they are used, not their
 * definitions.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i, inc_x2, inc_y2;
/* py is the read cursor for y; y itself serves as the write cursor. */
FLOAT *py;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7;
v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
/* Separated real (…r) and imaginary (…i) parts of x and y. */
v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
/* Scalar temporaries for the tails and the fully strided path. */
FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7;
FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7;
if (n < 0) return(0);
/* alpha == 0 leaves y unchanged. */
if ((da_r == 0.0) && (da_i == 0.0)) return(0);
py = y;
/* Path 1: both vectors contiguous. */
if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;
/* Distance, in FLOAT elements, from x to the next L1_DATA_LINESIZE
   boundary (0 when x is already on a boundary). */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
/* Prefetch cursor starts 64 FLOATs past that boundary. */
x_pref = x + pref_offset + 64;
/* Same computation for y. */
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;
/* da_r and da_i as vectors for the vector arithmetic below. */
dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
/* Main loop: one iteration per 16 complex elements (32 FLOATs). */
for (i = (n >> 4); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
/* Advance the prefetch cursors by the 32 FLOATs consumed per iteration. */
x_pref += 32;
y_pref += 32;
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
/* Presumably de-interleaves each vector pair into real (even) and
   imaginary (odd) lanes - confirm against macros_msa.h. */
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
PCKEVOD_W2_SP(y7, y6, y3r, y3i);
/* da_r * x_r term on the real parts of y (via FMADD4). */
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
/* y_i OP0 da_r * x_i */
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
/* y_r OP1 da_i * x_i */
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
/* y_i OP2 da_i * x_r */
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
/* Presumably re-interleaves real/imaginary parts before storing. */
ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ILVRL_W2_SP(y2i, y2r, y4, y5);
ILVRL_W2_SP(y3i, y3r, y6, y7);
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}
/* Tail: remaining n & 15 elements, handled by the bits of n. */
if (n & 15)
{
/* 8 complex elements: four vectors. */
if (n & 8)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
LD_SP4_INC(py, 4, y0, y1, y2, y3);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}
/* 4 complex elements: two vectors. */
if (n & 4)
{
LD_SP2_INC(x, 4, x0, x1);
LD_SP2_INC(py, 4, y0, y1);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_W2_SP(y0i, y0r, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}
/* 2 complex elements: scalar arithmetic on four FLOATs. */
if (n & 2)
{
LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3);
LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;
ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
}
/* Final single complex element. */
if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
/* Path 2: y contiguous, x strided. */
else if (1 == inc_y)
{
FLOAT *y_pref;
BLASLONG pref_offset;
v4f32 x8, x9, x10, x11, x12, x13, x14;
/* Prefetch cursor for y only, computed as in path 1. */
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;
/* Stride of x in FLOATs (two FLOATs per complex element). */
inc_x2 = 2 * inc_x;
dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
/* Main loop: 16 complex elements per iteration. */
for (i = (n >> 4); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 32;
/* Fifteen vector loads, one per strided x element; only the low
   half (one complex value) of each is kept by the packs below. */
LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14);
LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7);
/* Pack the low halves of consecutive loads into full vectors. */
PCKEV_D2_SP(x1, x0, x3, x2, x0, x1);
PCKEV_D2_SP(x5, x4, x14, x6, x2, x3);
PCKEV_D2_SP(x9, x8, x11, x10, x4, x5);
x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12);
/* The 16th element is read with two scalar 32-bit loads and inserted
   into lanes 2 and 3 of x7 instead of using a vector load - presumably
   to avoid reading past the last element; TODO confirm. */
x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x));
x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1)));
x += inc_x2;
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
PCKEVOD_W2_SP(y7, y6, y3r, y3i);
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ILVRL_W2_SP(y2i, y2r, y4, y5);
ILVRL_W2_SP(y3i, y3r, y6, y7);
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}
/* Tail: remaining n & 15 elements. */
if (n & 15)
{
/* 8 elements: seven vector loads plus one scalar-inserted element. */
if (n & 8)
{
LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3);
PCKEV_D2_SP(x1, x0, x6, x2, x0, x1);
x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4);
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1)));
x += inc_x2;
LD_SP4_INC(py, 4, y0, y1, y2, y3);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_W2_SP(y0i, y0r, y0, y1);
ILVRL_W2_SP(y1i, y1r, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}
/* 4 elements: three vector loads plus one scalar-inserted element. */
if (n & 4)
{
LD_SP3_INC(x, inc_x2, x0, x2, x1);
x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0);
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1)));
x += inc_x2;
LD_SP2_INC(py, 4, y0, y1);
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_W2_SP(y0i, y0r, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}
/* 2 elements: scalar loads from strided x. */
if (n & 2)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;
LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;
ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
}
/* Final single complex element. */
if (n & 1)
{
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
/* Path 3: y strided (x may be strided or contiguous) - scalar code. */
else
{
/* Strides in FLOATs (two FLOATs per complex element). */
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
/* Main loop: 4 complex elements per iteration. */
for (i = (n >> 2); i--;)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;
xd4 = x[0];
xd5 = x[1];
x += inc_x2;
xd6 = x[0];
xd7 = x[1];
x += inc_x2;
yd0 = py[0];
yd1 = py[1];
py += inc_y2;
yd2 = py[0];
yd3 = py[1];
py += inc_y2;
yd4 = py[0];
yd5 = py[1];
py += inc_y2;
yd6 = py[0];
yd7 = py[1];
py += inc_y2;
FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd5 OP0 da_r * xd5;
yd7 OP0 da_r * xd7;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd4 OP1 da_i * xd5;
yd6 OP1 da_i * xd7;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;
yd5 OP2 da_i * xd4;
yd7 OP2 da_i * xd6;
y[0] = yd0;
y[1] = yd1;
y += inc_y2;
y[0] = yd2;
y[1] = yd3;
y += inc_y2;
y[0] = yd4;
y[1] = yd5;
y += inc_y2;
y[0] = yd6;
y[1] = yd7;
y += inc_y2;
}
/* Tail: remaining n & 3 elements. */
if (n & 3)
{
if (n & 2)
{
xd0 = x[0];
xd1 = x[1];
x += inc_x2;
xd2 = x[0];
xd3 = x[1];
x += inc_x2;
yd0 = py[0];
yd1 = py[1];
py += inc_y2;
yd2 = py[0];
yd3 = py[1];
py += inc_y2;
FMADD2(xd0, xd2, da_r, yd0, yd2);
yd1 OP0 da_r * xd1;
yd3 OP0 da_r * xd3;
yd0 OP1 da_i * xd1;
yd2 OP1 da_i * xd3;
yd1 OP2 da_i * xd0;
yd3 OP2 da_i * xd2;
y[0] = yd0;
y[1] = yd1;
y += inc_y2;
y[0] = yd2;
y[1] = yd3;
y += inc_y2;
}
/* Last element is read through y directly; y and py have advanced by
   the same amount in this path, so they address the same element. */
if (n & 1)
{
xd0 = x[0];
xd1 = x[1];
yd0 = y[0];
yd1 = y[1];
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
y[0] = yd0;
y[1] = yd1;
}
}
}
return (0);
}

201
kernel/mips/ccopy_msa.c Normal file
View File

@ -0,0 +1,201 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Complex COPY kernel built on MSA v4f32 vectors: copies n complex elements
 * (interleaved real/imaginary FLOAT pairs) from x to y.
 *
 * Parameters:
 *   n         number of complex elements; n < 0 returns immediately
 *   x, inc_x  source vector and its stride, in complex elements
 *   y, inc_y  destination vector and its stride, in complex elements
 *
 * Returns 0 on every path.
 *
 * With both strides equal to 1 the copy is done with vector loads/stores,
 * 32 complex elements per main-loop iteration; otherwise it is done with
 * scalar loads/stores, 4 complex elements per iteration.
 *
 * NOTE(review): LD_SP*, ST_SP*, LD_GP*, ST_GP* and PREF_OFFSET come from
 * macros_msa.h, which is not visible here.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i, inc_x2, inc_y2;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
if (n < 0) return (0);
/* Contiguous path. */
if ((1 == inc_x) && (1 == inc_y))
{
/* At least one full group of 32 complex elements (64 FLOATs). */
if (n > 31)
{
FLOAT *x_pref;
BLASLONG pref_offset;
/* Distance, in FLOAT elements, from x to the next L1_DATA_LINESIZE
   boundary (0 when x is already on a boundary). */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 128 + 32;
/* Prime the pipeline: first eight vectors are loaded before the loop. */
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
/* Software-pipelined loop: each load of a new vector is paired with
   the store of a vector loaded eight steps earlier. It runs one
   iteration fewer than the number of groups; the last group is
   drained after the loop. */
for (i = (n >> 5) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
/* 64 FLOATs are consumed per iteration. */
x_pref += 64;
/* Load x8..x15 while storing x0..x7. */
x8 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x7, y); y += 4;
/* Load the next x0..x7 while storing x8..x15. */
x0 = LD_SP(x); x += 4;
ST_SP(x8, y); y += 4;
x1 = LD_SP(x); x += 4;
ST_SP(x9, y); y += 4;
x2 = LD_SP(x); x += 4;
ST_SP(x10, y); y += 4;
x3 = LD_SP(x); x += 4;
ST_SP(x11, y); y += 4;
x4 = LD_SP(x); x += 4;
ST_SP(x12, y); y += 4;
x5 = LD_SP(x); x += 4;
ST_SP(x13, y); y += 4;
x6 = LD_SP(x); x += 4;
ST_SP(x14, y); y += 4;
x7 = LD_SP(x); x += 4;
ST_SP(x15, y); y += 4;
}
/* Drain: load the second half of the last group, store all sixteen. */
x8 = LD_SP(x); x += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
ST_SP(x7, y); y += 4;
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
}
/* Tail: remaining n & 31 elements, handled by the bits of n. */
if (n & 31)
{
/* 16 complex elements: eight vectors. */
if (n & 16)
{
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
}
/* 8 complex elements: four vectors. */
if (n & 8)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, y, 4);
}
/* 4 complex elements: two vectors. */
if (n & 4)
{
LD_SP2_INC(x, 4, x0, x1);
ST_SP2_INC(x0, x1, y, 4);
}
/* 2 complex elements: four scalars. */
if (n & 2)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}
/* 1 complex element: two scalars. */
if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}
/* Strided path: scalar copies. */
else
{
/* Strides in FLOATs (two FLOATs per complex element). */
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
/* Main loop: 4 complex elements per iteration, all loads before stores. */
for (i = (n >> 2); i--;)
{
f0 = *x;
f1 = *(x+1); x += inc_x2;
f2 = *x;
f3 = *(x+1); x += inc_x2;
f4 = *x;
f5 = *(x+1); x += inc_x2;
f6 = *x;
f7 = *(x+1); x += inc_x2;
*y = f0;
*(y+1) = f1; y += inc_y2;
*y = f2;
*(y+1) = f3; y += inc_y2;
*y = f4;
*(y+1) = f5; y += inc_y2;
*y = f6;
*(y+1) = f7; y += inc_y2;
}
/* 2 remaining complex elements. */
if (n & 2)
{
f0 = *x;
f1 = *(x+1); x += inc_x2;
f2 = *x;
f3 = *(x+1); x += inc_x2;
*y = f0;
*(y+1) = f1; y += inc_y2;
*y = f2;
*(y+1) = f3; y += inc_y2;
}
/* Final complex element: its two FLOATs are adjacent, so stride 1. */
if (n & 1)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
return (0);
}

1012
kernel/mips/cscal_msa.c Normal file

File diff suppressed because it is too large Load Diff

281
kernel/mips/cswap_msa.c Normal file
View File

@ -0,0 +1,281 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Complex SWAP kernel built on MSA v4f32 vectors: exchanges n complex
 * elements (interleaved real/imaginary FLOAT pairs) between srcx and srcy.
 *
 * Parameters:
 *   n                       number of complex elements; n < 0 returns
 *                           immediately
 *   dummy0, dummy1          not used by this function
 *   dummy3, dummy4          not used by this function
 *   srcx, inc_x             first vector and its stride, in complex elements
 *   srcy, inc_y             second vector and its stride, in complex elements
 *   dummy, dummy2           not used by this function
 *
 * Returns 0 on every path.
 *
 * With both strides equal to 1 the swap uses vector loads/stores, 16 complex
 * elements per main-loop iteration; otherwise it uses scalar code, 4 complex
 * elements per iteration.
 *
 * NOTE(review): LD_SP*, ST_SP*, LD_GP*, ST_GP* and PREFETCH come from
 * macros_msa.h, which is not visible here.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
/* px/py are read cursors; srcx/srcy are used as write cursors. */
FLOAT *px, *py;
BLASLONG inc_x2, inc_y2;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
if (n < 0) return (0);
/* Distance, in FLOAT elements, from each vector's start to the next
   L1_DATA_LINESIZE boundary (0 when already on a boundary). */
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}
px = srcx;
py = srcy;
/* Strides in FLOATs (two FLOATs per complex element). */
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
/* Contiguous path. */
if ((1 == inc_x) && (1 == inc_y))
{
/* At least one full group of 16 complex elements (eight vectors). */
if (n >> 4)
{
/* Prime the pipeline with the first eight x vectors. */
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
/* Software-pipelined loop, one iteration fewer than the number of
   groups; the last group is finished after the loop. */
for (i = (n >> 4) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 32);
PREFETCH(px + pref_offsetx + 40);
PREFETCH(px + pref_offsetx + 48);
PREFETCH(px + pref_offsetx + 56);
PREFETCH(py + pref_offsety + 32);
PREFETCH(py + pref_offsety + 40);
PREFETCH(py + pref_offsety + 48);
PREFETCH(py + pref_offsety + 56);
/* Read the y group while writing the held x group into y. */
yv0 = LD_SP(py); py += 4;
ST_SP(xv0, srcy); srcy += 4;
yv1 = LD_SP(py); py += 4;
ST_SP(xv1, srcy); srcy += 4;
yv2 = LD_SP(py); py += 4;
ST_SP(xv2, srcy); srcy += 4;
yv3 = LD_SP(py); py += 4;
ST_SP(xv3, srcy); srcy += 4;
yv4 = LD_SP(py); py += 4;
ST_SP(xv4, srcy); srcy += 4;
yv5 = LD_SP(py); py += 4;
ST_SP(xv5, srcy); srcy += 4;
yv6 = LD_SP(py); py += 4;
ST_SP(xv6, srcy); srcy += 4;
yv7 = LD_SP(py); py += 4;
ST_SP(xv7, srcy); srcy += 4;
/* Read the next x group while writing the y group into x. */
xv0 = LD_SP(px); px += 4;
ST_SP(yv0, srcx); srcx += 4;
xv1 = LD_SP(px); px += 4;
ST_SP(yv1, srcx); srcx += 4;
xv2 = LD_SP(px); px += 4;
ST_SP(yv2, srcx); srcx += 4;
xv3 = LD_SP(px); px += 4;
ST_SP(yv3, srcx); srcx += 4;
xv4 = LD_SP(px); px += 4;
ST_SP(yv4, srcx); srcx += 4;
xv5 = LD_SP(px); px += 4;
ST_SP(yv5, srcx); srcx += 4;
xv6 = LD_SP(px); px += 4;
ST_SP(yv6, srcx); srcx += 4;
xv7 = LD_SP(px); px += 4;
ST_SP(yv7, srcx); srcx += 4;
}
/* Finish the last group: its x vectors are already loaded. */
LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
}
/* Tail: remaining n & 15 elements. Bits 8, 4 and 2 of n are handled by
   one branch of the chain below (each vector holds 2 complex elements);
   bit 1 is handled separately afterwards. */
if (n & 15)
{
/* 14 complex elements: seven vectors. */
if ((n & 8) && (n & 4) && (n & 2))
{
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
}
/* 12 complex elements: six vectors. */
else if ((n & 8) && (n & 4))
{
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
}
/* 10 complex elements: five vectors. */
else if ((n & 8) && (n & 2))
{
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
}
/* 6 complex elements: three vectors. */
else if ((n & 4) && (n & 2))
{
LD_SP3_INC(px, 4, xv0, xv1, xv2);
LD_SP3_INC(py, 4, yv0, yv1, yv2);
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
}
/* 8 complex elements: four vectors. */
else if (n & 8)
{
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
}
/* 4 complex elements: two vectors. */
else if (n & 4)
{
LD_SP2_INC(px, 4, xv0, xv1);
LD_SP2_INC(py, 4, yv0, yv1);
ST_SP2_INC(xv0, xv1, srcy, 4);
ST_SP2_INC(yv0, yv1, srcx, 4);
}
/* 2 complex elements: one vector. */
else if (n & 2)
{
xv0 = LD_SP(px);
yv0 = LD_SP(py);
px += 4;
py += 4;
ST_SP(xv0, srcy);
ST_SP(yv0, srcx);
srcx += 4;
srcy += 4;
}
/* Final single complex element: two scalars each way. */
if (n & 1)
{
LD_GP2_INC(px, 1, x0, x1);
LD_GP2_INC(py, 1, y0, y1);
ST_GP2_INC(x0, x1, srcy, 1);
ST_GP2_INC(y0, y1, srcx, 1);
}
}
}
/* Strided path: scalar code indexing directly off srcx/srcy. */
else
{
/* Main loop: 4 complex elements per iteration; everything is read into
   temporaries before anything is written back. */
for (i = (n >> 2); i--;)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];
x2 = srcx[1 * inc_x2];
x3 = srcx[1 * inc_x2 + 1];
x4 = srcx[2 * inc_x2];
x5 = srcx[2 * inc_x2 + 1];
x6 = srcx[3 * inc_x2];
x7 = srcx[3 * inc_x2 + 1];
y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];
y2 = srcy[1 * inc_y2];
y3 = srcy[1 * inc_y2 + 1];
y4 = srcy[2 * inc_y2];
y5 = srcy[2 * inc_y2 + 1];
y6 = srcy[3 * inc_y2];
y7 = srcy[3 * inc_y2 + 1];
srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;
srcx[1 * inc_x2] = y2;
srcx[1 * inc_x2 + 1] = y3;
srcx[2 * inc_x2] = y4;
srcx[2 * inc_x2 + 1] = y5;
srcx[3 * inc_x2] = y6;
srcx[3 * inc_x2 + 1] = y7;
srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;
srcy[1 * inc_y2] = x2;
srcy[1 * inc_y2 + 1] = x3;
srcy[2 * inc_y2] = x4;
srcy[2 * inc_y2 + 1] = x5;
srcy[3 * inc_y2] = x6;
srcy[3 * inc_y2 + 1] = x7;
srcx += 4 * inc_x2;
srcy += 4 * inc_y2;
}
/* 2 remaining complex elements. */
if (n & 2)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];
x2 = srcx[1 * inc_x2];
x3 = srcx[1 * inc_x2 + 1];
y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];
y2 = srcy[1 * inc_y2];
y3 = srcy[1 * inc_y2 + 1];
srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;
srcx[1 * inc_x2] = y2;
srcx[1 * inc_x2 + 1] = y3;
srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;
srcy[1 * inc_y2] = x2;
srcy[1 * inc_y2 + 1] = x3;
srcx += 2 * inc_x2;
srcy += 2 * inc_y2;
}
/* Final single complex element. */
if (n & 1)
{
x0 = srcx[0 * inc_x2];
x1 = srcx[0 * inc_x2 + 1];
y0 = srcy[0 * inc_y2];
y1 = srcy[0 * inc_y2 + 1];
srcx[0 * inc_x2] = y0;
srcx[0 * inc_x2 + 1] = y1;
srcy[0 * inc_y2] = x0;
srcy[0 * inc_y2 + 1] = x1;
srcx += inc_x2;
srcy += inc_y2;
}
}
return (0);
}

246
kernel/mips/daxpy_msa.c Normal file
View File

@ -0,0 +1,246 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Sign-selection operators keyed on CONJ.
 * NOTE(review): OP0/OP1/OP2 are not referenced directly by the real-valued
 * kernel that follows in this file; they look like a leftover from the
 * complex AXPY kernel - confirm before removing.
 */
#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 -=
#endif
/*
 * Real AXPY kernel built on MSA v2f64 vectors: y := y + da * x over n
 * elements.
 *
 * Parameters:
 *   n               number of elements; n < 0 returns immediately
 *   dummy0, dummy1  not used by this function
 *   da              scalar multiplier; da == 0.0 returns without touching y
 *   x, inc_x        input vector and its stride, in elements
 *   y, inc_y        in/out vector and its stride, in elements
 *   dummy, dummy2   not used by this function
 *
 * Returns 0 on every path.
 *
 * Three paths are selected by the strides:
 *   1. inc_x == 1 and inc_y == 1: fully vectorized, 16 elements per
 *      iteration.
 *   2. inc_y == 1 only: x is gathered element by element into vectors,
 *      y is processed contiguously.
 *   3. otherwise: scalar code, 4 elements per iteration.
 *
 * NOTE(review): LD_DP*, ST_DP*, LD_GP*, ST_GP*, PCKEV_D2_SD, FMADD*,
 * PREF_OFFSET and COPY_DOUBLE_TO_VECTOR come from macros_msa.h, which is
 * not visible here.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i;
/* py is the read cursor for y; y itself serves as the write cursor. */
FLOAT *py;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
v2f64 da_vec, zero_v = {0};
if ((n < 0) || (da == 0.0)) return(0);
py = y;
/* Path 1: both vectors contiguous. */
if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;
/* Distance, in FLOAT elements, from x to the next L1_DATA_LINESIZE
   boundary (0 when x is already on a boundary). */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;
/* Same computation for y. */
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;
da_vec = COPY_DOUBLE_TO_VECTOR(da);
/* Main loop: 16 elements (eight vectors) per iteration. */
for (i = (n >> 4); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
/* 16 elements are consumed per iteration. */
x_pref += 16;
y_pref += 16;
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}
/* Tail: remaining n & 15 elements, handled by the bits of n. */
if (n & 15)
{
if (n & 8)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}
if (n & 4)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}
if (n & 2)
{
x0 = LD_DP(x); x += 2;
y0 = LD_DP(py); py += 2;
y0 += da_vec * x0;
ST_DP(y0, y); y += 2;
}
if (n & 1)
{
y[0] += da * x[0];
}
}
}
/* Path 2: y contiguous, x strided. */
else if (1 == inc_y)
{
FLOAT *y_pref;
BLASLONG pref_offset;
v2f64 x8, x9, x10, x11, x12, x13, x14;
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;
da_vec = COPY_DOUBLE_TO_VECTOR(da);
/* Main loop: 16 elements per iteration. */
for (i = (n >> 4); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 16;
/* Fifteen vector loads, one per strided x element; only the low
   lane of each is kept by the packs below. */
LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14);
LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7);
PCKEV_D2_SD(x1, x0, x3, x2, x0, x1);
PCKEV_D2_SD(x5, x4, x14, x6, x2, x3);
PCKEV_D2_SD(x9, x8, x11, x10, x4, x5);
x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12);
/* The 16th element is inserted from a scalar load instead of a vector
   load. Its bit pattern is read as long long (always 64 bits) rather
   than BLASLONG, so the whole double is fetched even on targets where
   BLASLONG is only 32 bits wide. */
x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((long long *) x));
x += inc_x;
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}
/* Tail: remaining n & 15 elements. */
if (n & 15)
{
/* 8 elements: seven vector loads plus one scalar-inserted element. */
if (n & 8)
{
LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3);
PCKEV_D2_SD(x1, x0, x6, x2, x0, x1);
x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);
x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((long long *) x));
x += inc_x;
LD_DP4_INC(py, 2, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}
/* 4 elements: three vector loads plus one scalar-inserted element. */
if (n & 4)
{
LD_DP3_INC(x, inc_x, x0, x2, x1);
x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0);
x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((long long *) x));
x += inc_x;
LD_DP2_INC(py, 2, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}
/* 2 elements: both lanes built from scalar loads. */
if (n & 2)
{
x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((long long *) x));
x += inc_x;
x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((long long *) x));
x += inc_x;
y0 = LD_DP(py); py += 2;
y0 += da_vec * x0;
ST_DP(y0, y); y += 2;
}
if (n & 1)
{
y[0] += da * x[0];
}
}
}
/* Path 3: y strided (x may be strided or contiguous) - scalar code. */
else
{
/* Scalar temporaries; these shadow the vector variables of the same
   names declared at function scope. */
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
for (i = (n >> 2); i--;)
{
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
}
if (n & 3)
{
if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
FMADD2(x0, x1, da, y0, y1);
ST_GP2_INC(y0, y1, y, inc_y);
}
if (n & 1)
{
*y += da * *x;
}
}
}
return (0);
}

180
kernel/mips/dcopy_msa.c Normal file
View File

@ -0,0 +1,180 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Copy kernel: copies n elements from x (stride inc_x) to y (stride inc_y).
 *
 * The contiguous path moves data in v2f64 vectors and advances the pointers
 * by 2 per vector, so FLOAT is presumably double in this build (TODO confirm
 * against common.h).
 *
 * A negative n is a no-op.  Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
if (n < 0) return (0);
/* Contiguous case: both strides are 1, so whole vectors can be moved. */
if ((1 == inc_x) && (1 == inc_y))
{
/* Bulk path: software-pipelined, 32 elements (16 vectors) per step. */
if (n > 31)
{
FLOAT *x_pref;
BLASLONG pref_offset;
/* Number of elements from x up to the next L1_DATA_LINESIZE boundary
   (stays 0 when x is already on a boundary). */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
/* Prefetch cursor starts 80 elements beyond that boundary. */
x_pref = x + pref_offset + 64 + 16;
/* Prime the pipeline: the first 8 vectors go into x0..x7.  The pointer
   accounting below relies on LD_DP8_INC advancing x by 2 per vector,
   like the explicit LD_DP / "x += 2" pairs. */
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
/* One iteration fewer than n / 32: the last group is drained after the
   loop so that no load runs past the bulk region. */
for (i = (n >> 5) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
/* Keep the prefetch cursor in step with the 32 elements consumed. */
x_pref += 32;
/* Loads into x8..x15 are interleaved with stores of x0..x7 ... */
x8 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x7, y); y += 2;
/* ... then x0..x7 are refilled while x8..x15 are stored. */
x0 = LD_DP(x); x += 2;
ST_DP(x8, y); y += 2;
x1 = LD_DP(x); x += 2;
ST_DP(x9, y); y += 2;
x2 = LD_DP(x); x += 2;
ST_DP(x10, y); y += 2;
x3 = LD_DP(x); x += 2;
ST_DP(x11, y); y += 2;
x4 = LD_DP(x); x += 2;
ST_DP(x12, y); y += 2;
x5 = LD_DP(x); x += 2;
ST_DP(x13, y); y += 2;
x6 = LD_DP(x); x += 2;
ST_DP(x14, y); y += 2;
x7 = LD_DP(x); x += 2;
ST_DP(x15, y); y += 2;
}
/* Drain: load the final 8 vectors and store all 16 still held in
   registers. */
x8 = LD_DP(x); x += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
ST_DP(x7, y); y += 2;
ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
}
/* Remainder: the low five bits of n, largest chunk first. */
if (n & 31)
{
/* 16 elements = 8 vectors. */
if (n & 16)
{
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
}
/* 8 elements = 4 vectors. */
if (n & 8)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, 2);
}
/* 4, 2 and 1 leftover elements are copied as scalars. */
if (n & 4)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}
if (n & 2)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
if (n & 1)
{
*y = *x;
}
}
}
/* Strided case: scalar copies, 8 at a time, then 4 / 2 / 1. */
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
}
if (n & 4)
{
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
}
if (n & 2)
{
LD_GP2_INC(x, inc_x, f0, f1);
ST_GP2_INC(f0, f1, y, inc_y);
}
if (n & 1)
{
*y = *x;
}
}
return (0);
}

368
kernel/mips/dscal_msa.c Normal file
View File

@ -0,0 +1,368 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Scale kernel: multiplies n elements of x (stride inc_x) by da, in place.
 *
 * Parameters
 *   n       number of elements to scale; a negative n is a no-op
 *   da      scale factor
 *   x       vector to scale
 *   inc_x   stride of x, in elements
 *   dummy0, dummy1, y, inc_y, dummy, dummy2
 *           not referenced by this kernel
 *
 * When da compares equal to 0.0 the elements are overwritten with zero
 * rather than multiplied, so NaN / Inf inputs also become 0.
 *
 * px is the read cursor and x the write cursor over the same array; reads
 * always stay at or ahead of writes.  The contiguous path works on v2f64
 * vectors and advances by 2 per vector, so FLOAT is presumably double in
 * this build (TODO confirm against common.h).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
          BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i;
    FLOAT *px;
    FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v2f64 da_vec;

    /* Same guard as the sibling copy/swap/axpy kernels: without it the
       "for (i = n; i--;)" style loops below would run away on negative n. */
    if (n < 0) return 0;

    px = x;

    if (1 == inc_x)
    {
        if (0.0 == da)
        {
            /* Contiguous zero fill. */
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            /* 32 elements (16 vectors) per iteration. */
            for (i = (n >> 5); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
            }

            /* Remainder: low five bits of n, largest chunk first. */
            if (n & 31)
            {
                if (n & 16)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 2);
                }

                if (n & 8)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
                }

                if (n & 4)
                {
                    ST_DP2_INC(zero_v, zero_v, x, 2);
                }

                if (n & 2)
                {
                    *x = 0; x += 1;
                    *x = 0; x += 1;
                }

                if (n & 1)
                {
                    *x = 0;
                }
            }
        }
        else
        {
            da_vec = COPY_DOUBLE_TO_VECTOR(da);

            /* Bulk path: software-pipelined, 32 elements per step. */
            if (n > 31)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Elements from x up to the next L1_DATA_LINESIZE boundary
                   (stays 0 when x is already on a boundary). */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }

                x_pref = x + pref_offset + 32 + 16;

                /* Prime the pipeline with the first 8 vectors. */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);

                /* One iteration fewer than n / 32; the last group is
                   drained after the loop. */
                for (i = 0; i < (n >> 5) - 1; i++)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    /* Load x8..x15 while scaling x0..x7. */
                    x8 = LD_DP(px); px += 2;
                    x0 *= da_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_vec;
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_vec;
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_vec;

                    /* Scale x8..x15 while storing x0..x7. */
                    x8 *= da_vec;
                    ST_DP(x0, x); x += 2;
                    x9 *= da_vec;
                    ST_DP(x1, x); x += 2;
                    x10 *= da_vec;
                    ST_DP(x2, x); x += 2;
                    x11 *= da_vec;
                    ST_DP(x3, x); x += 2;
                    x12 *= da_vec;
                    ST_DP(x4, x); x += 2;
                    x13 *= da_vec;
                    ST_DP(x5, x); x += 2;
                    x14 *= da_vec;
                    ST_DP(x6, x); x += 2;
                    x15 *= da_vec;
                    ST_DP(x7, x); x += 2;

                    /* Store x8..x15 while reloading x0..x7. */
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                /* Drain: load the last 8 vectors, scale and store all 16. */
                x8 = LD_DP(px); px += 2;
                x0 *= da_vec;
                x9 = LD_DP(px); px += 2;
                x1 *= da_vec;
                x10 = LD_DP(px); px += 2;
                x2 *= da_vec;
                x11 = LD_DP(px); px += 2;
                x3 *= da_vec;
                x12 = LD_DP(px); px += 2;
                x4 *= da_vec;
                x13 = LD_DP(px); px += 2;
                x5 *= da_vec;
                x14 = LD_DP(px); px += 2;
                x6 *= da_vec;
                x15 = LD_DP(px); px += 2;
                x7 *= da_vec;
                x8 *= da_vec;
                ST_DP(x0, x); x += 2;
                x9 *= da_vec;
                ST_DP(x1, x); x += 2;
                x10 *= da_vec;
                ST_DP(x2, x); x += 2;
                x11 *= da_vec;
                ST_DP(x3, x); x += 2;
                x12 *= da_vec;
                ST_DP(x4, x); x += 2;
                x13 *= da_vec;
                ST_DP(x5, x); x += 2;
                x15 *= da_vec;
                ST_DP(x6, x); x += 2;
                x14 *= da_vec;
                ST_DP(x7, x); x += 2;
                ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
            }

            /* Remainder: low five bits of n, largest chunk first. */
            if (n & 31)
            {
                if (n & 16)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 8)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 4)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_vec, x1, da_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }

                if (n & 1)
                {
                    /* x has caught up with px here, so it addresses the
                       last unscaled element. */
                    *x *= da;
                }
            }
        }
    }
    else
    {
        if (da == 0.0)
        {
            /* Strided zero fill. */
            for (i = n; i--;)
            {
                *x = 0.0;
                x += inc_x;
            }
        }
        else
        {
            /* Strided bulk path: scalar, pipelined, 16 elements per step. */
            if (n > 15)
            {
                LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);

                for (i = 0; i < (n >> 4) - 1; i++)
                {
                    LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                    f4 *= da;
                    f5 *= da;
                    *x = f0; x += inc_x;
                    f6 *= da;
                    *x = f1; x += inc_x;
                    f7 *= da;
                    *x = f2; x += inc_x;
                    f8 *= da;
                    *x = f3; x += inc_x;
                    f9 *= da;
                    *x = f4; x += inc_x;
                    f10 *= da;
                    *x = f5; x += inc_x;
                    f11 *= da;
                    *x = f6; x += inc_x;
                    f12 *= da;
                    *x = f7; x += inc_x;
                    f13 *= da;
                    *x = f8; x += inc_x;
                    f14 *= da;
                    *x = f9; x += inc_x;
                    f15 *= da;
                    *x = f10; x += inc_x;
                    *x = f11; x += inc_x;
                    f0 = *px; px += inc_x;
                    *x = f12; x += inc_x;
                    f1 = *px; px += inc_x;
                    *x = f13; x += inc_x;
                    f2 = *px; px += inc_x;
                    *x = f14; x += inc_x;
                    f3 = *px; px += inc_x;
                    *x = f15; x += inc_x;
                    f4 = *px; px += inc_x;
                    f5 = *px; px += inc_x;
                    f6 = *px; px += inc_x;
                    f7 = *px; px += inc_x;
                }

                /* Drain: load the last 8 scalars, scale and store all 16. */
                LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
                MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);

                f4 *= da;
                f5 *= da;
                *x = f0; x += inc_x;
                f6 *= da;
                *x = f1; x += inc_x;
                f7 *= da;
                *x = f2; x += inc_x;
                f8 *= da;
                *x = f3; x += inc_x;
                f9 *= da;
                *x = f4; x += inc_x;
                f10 *= da;
                *x = f5; x += inc_x;
                f11 *= da;
                *x = f6; x += inc_x;
                f12 *= da;
                *x = f7; x += inc_x;
                f13 *= da;
                *x = f8; x += inc_x;
                f14 *= da;
                *x = f9; x += inc_x;
                f15 *= da;
                *x = f10; x += inc_x;
                *x = f11; x += inc_x;
                *x = f12; x += inc_x;
                *x = f13; x += inc_x;
                *x = f14; x += inc_x;
                *x = f15; x += inc_x;
            }

            /* Remainder: low four bits of n, largest chunk first. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
                    ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
                }

                if (n & 4)
                {
                    LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
                    MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
                    ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
                }

                if (n & 2)
                {
                    LD_GP2_INC(px, inc_x, f0, f1);
                    MUL2(f0, da, f1, da, f0, f1);
                    ST_GP2_INC(f0, f1, x, inc_x);
                }

                if (n & 1)
                {
                    *x *= da;
                }
            }
        }
    }

    return 0;
}

253
kernel/mips/dswap_msa.c Normal file
View File

@ -0,0 +1,253 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Swap kernel: exchanges n elements between srcx (stride inc_x) and srcy
 * (stride inc_y).
 *
 * px / py are the read cursors and srcx / srcy the write cursors over the
 * same two arrays; every value is loaded before its location is overwritten.
 * dummy0, dummy1, dummy3, dummy and dummy2 are not referenced.
 *
 * The contiguous path uses v2f64 vectors and advances by 2 per vector, so
 * FLOAT is presumably double in this build (TODO confirm against common.h).
 *
 * A negative n is a no-op.  Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
FLOAT *px, *py;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
if (n < 0) return (0);
/* Elements from srcx / srcy up to the next L1_DATA_LINESIZE boundary
   (each stays 0 when the pointer is already on a boundary); used only to
   bias the prefetch addresses below. */
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}
px = srcx;
py = srcy;
/* Contiguous case: whole vectors are exchanged. */
if ((1 == inc_x) && (1 == inc_y))
{
/* Bulk path: software-pipelined, 16 elements (8 vectors) per step. */
if (n >> 4)
{
/* Prime the pipeline with the first 8 vectors of x. */
LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
/* One iteration fewer than n / 16; the last group is finished after
   the loop. */
for (i = (n >> 4) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 16);
PREFETCH(px + pref_offsetx + 20);
PREFETCH(px + pref_offsetx + 24);
PREFETCH(px + pref_offsetx + 28);
PREFETCH(py + pref_offsety + 16);
PREFETCH(py + pref_offsety + 20);
PREFETCH(py + pref_offsety + 24);
PREFETCH(py + pref_offsety + 28);
/* Read 8 vectors of y while writing the held x vectors into y. */
yv0 = LD_DP(py); py += 2;
ST_DP(xv0, srcy); srcy += 2;
yv1 = LD_DP(py); py += 2;
ST_DP(xv1, srcy); srcy += 2;
yv2 = LD_DP(py); py += 2;
ST_DP(xv2, srcy); srcy += 2;
yv3 = LD_DP(py); py += 2;
ST_DP(xv3, srcy); srcy += 2;
yv4 = LD_DP(py); py += 2;
ST_DP(xv4, srcy); srcy += 2;
yv5 = LD_DP(py); py += 2;
ST_DP(xv5, srcy); srcy += 2;
yv6 = LD_DP(py); py += 2;
ST_DP(xv6, srcy); srcy += 2;
yv7 = LD_DP(py); py += 2;
ST_DP(xv7, srcy); srcy += 2;
/* Read the next 8 vectors of x while writing the y vectors into x. */
xv0 = LD_DP(px); px += 2;
ST_DP(yv0, srcx); srcx += 2;
xv1 = LD_DP(px); px += 2;
ST_DP(yv1, srcx); srcx += 2;
xv2 = LD_DP(px); px += 2;
ST_DP(yv2, srcx); srcx += 2;
xv3 = LD_DP(px); px += 2;
ST_DP(yv3, srcx); srcx += 2;
xv4 = LD_DP(px); px += 2;
ST_DP(yv4, srcx); srcx += 2;
xv5 = LD_DP(px); px += 2;
ST_DP(yv5, srcx); srcx += 2;
xv6 = LD_DP(px); px += 2;
ST_DP(yv6, srcx); srcx += 2;
xv7 = LD_DP(px); px += 2;
ST_DP(yv7, srcx); srcx += 2;
}
/* Drain: exchange the last group of 8 vectors. */
LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
}
/* Remainder: bits 8, 4 and 2 of n select exactly one branch of the chain
   below, which moves 7, 6, 5, 3, 4, 2 or 1 vectors in one go.  The odd
   element (bit 1) is handled separately afterwards. */
if (n & 15)
{
if ((n & 8) && (n & 4) && (n & 2))
{
LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
}
else if ((n & 8) && (n & 4))
{
LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
}
else if ((n & 8) && (n & 2))
{
LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
}
else if ((n & 4) && (n & 2))
{
LD_DP3_INC(px, 2, xv0, xv1, xv2);
LD_DP3_INC(py, 2, yv0, yv1, yv2);
ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
}
else if (n & 8)
{
LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
}
else if (n & 4)
{
LD_DP2_INC(px, 2, xv0, xv1);
LD_DP2_INC(py, 2, yv0, yv1);
ST_DP2_INC(xv0, xv1, srcy, 2);
ST_DP2_INC(yv0, yv1, srcx, 2);
}
else if (n & 2)
{
xv0 = LD_DP(px);
yv0 = LD_DP(py);
px += 2;
py += 2;
ST_DP(xv0, srcy);
ST_DP(yv0, srcx);
srcx += 2;
srcy += 2;
}
/* Final odd element, swapped as scalars. */
if (n & 1)
{
x0 = px[0];
y0 = py[0];
srcx[0] = y0;
srcy[0] = x0;
}
}
}
/* Strided case: scalar swaps, 8 elements per iteration. */
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
}
/* Remainder: bits 4, 2 and 1 of n select exactly one branch, moving
   7, 6, 5, 3, 4, 2 or 1 elements. */
if (n & 7)
{
if ((n & 4) && (n & 2) && (n & 1))
{
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
}
else if ((n & 4) && (n & 2))
{
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
}
else if ((n & 4) && (n & 1))
{
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
}
else if ((n & 2) && (n & 1))
{
LD_GP3_INC(px, inc_x, x0, x1, x2);
LD_GP3_INC(py, inc_y, y0, y1, y2);
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
}
else if (n & 4)
{
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
}
else if (n & 2)
{
LD_GP2_INC(px, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
ST_GP2_INC(x0, x1, srcy, inc_y);
ST_GP2_INC(y0, y1, srcx, inc_x);
}
else if (n & 1)
{
/* srcx / srcy have caught up with px / py here, so they address the
   last unswapped pair. */
x0 = *srcx;
y0 = *srcy;
*srcx = y0;
*srcy = x0;
}
}
}
return (0);
}

View File

@ -722,6 +722,31 @@ inline static void prefetch_load_lf(unsigned char *src)
MUL2(in4, in5, in6, in7, out2, out3); \
}
/* Description : Multiply-add of 2, 3 or 4 inputs into accumulators
   Arguments   : Inputs  - in0, in1 (, in2, in3), vec
                 In/Out  - inout0, inout1 (, inout2, inout3)
   Details     : Each 'inN' is multiplied by 'vec' and the product is added
                 to the matching 'inoutN'.
                 Every parameter is parenthesised in the expansion, so
                 expression arguments such as 'a + b' keep their meaning.
                 'vec' is expanded once per accumulator and should therefore
                 be free of side effects.
*/
#define FMADD2(in0, in1, vec, inout0, inout1)  \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
}
#define FMADD3(in0, in1, in2, vec,             \
               inout0, inout1, inout2)         \
{                                              \
    (inout0) += (in0) * (vec);                 \
    (inout1) += (in1) * (vec);                 \
    (inout2) += (in2) * (vec);                 \
}
#define FMADD4(in0, in1, in2, in3, vec,            \
               inout0, inout1, inout2, inout3)     \
{                                                  \
    FMADD2(in0, in1, vec, inout0, inout1);         \
    FMADD2(in2, in3, vec, inout2, inout3);         \
}
/* Description : Addition of 2 pairs of variables
Arguments : Inputs - in0, in1, in2, in3
Outputs - out0, out1

265
kernel/mips/saxpy_msa.c Normal file
View File

@ -0,0 +1,265 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Compound-assignment operators selected by CONJ: with CONJ defined the
   signs of OP0/OP1/OP2 are flipped relative to the default.
   NOTE(review): none of them is referenced in the visible body of this
   kernel; presumably carried over from the complex AXPY kernels - confirm
   before removing. */
#ifdef CONJ
#define OP0 -=
#define OP1 +=
#define OP2 -=
#else
#define OP0 +=
#define OP1 -=
#define OP2 +=
#endif
/* AXPY kernel: y[i * inc_y] += da * x[i * inc_x] for n elements.
 *
 * Returns immediately when n is negative or da compares equal to 0.
 * dummy0, dummy1, dummy and dummy2 are not referenced.
 *
 * Three paths:
 *   - both strides 1: vector loads of x and y;
 *   - only inc_y == 1: x is gathered lane by lane into vectors, y is
 *     accessed as vectors;
 *   - otherwise: plain scalar code.
 *
 * py is the read cursor over y and y itself the write cursor.  Vectors are
 * v4f32 and the pointers advance by 4 per vector, so FLOAT is presumably
 * float in this build (TODO confirm against common.h).
 *
 * Always returns 0.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i;
FLOAT *py;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 da_vec, zero_v = {0};
if ((n < 0) || (da == 0.0)) return(0);
py = y;
/* Path 1: x and y both contiguous. */
if ((1 == inc_x) && (1 == inc_y))
{
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;
/* Elements from x up to the next L1_DATA_LINESIZE boundary (stays 0 when
   x is already on a boundary); the prefetch cursor starts 64 elements
   beyond it. */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 64;
/* Same computation for y. */
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 64;
da_vec = COPY_FLOAT_TO_VECTOR(da);
/* Main loop: 32 elements (8 vectors of x and of y) per iteration. */
for (i = (n >> 5); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 32;
y_pref += 32;
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
}
/* Remainder: low five bits of n, largest chunk first. */
if (n & 31)
{
if (n & 16)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
LD_SP4_INC(py, 4, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}
if (n & 8)
{
LD_SP2_INC(x, 4, x0, x1);
LD_SP2_INC(py, 4, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}
if (n & 4)
{
x0 = LD_SP(x); x += 4;
y0 = LD_SP(py); py += 4;
y0 += da_vec * x0;
ST_SP(y0, y); y += 4;
}
/* The last 2 and 1 elements are updated as scalars directly through y;
   py is no longer used from here on. */
if (n & 2)
{
FMADD2(x[0], x[1], da, y[0], y[1]);
x += 2;
y += 2;
}
if (n & 1)
{
y[0] += da * x[0];
}
}
}
/* Path 2: y contiguous, x strided.  Each x vector is assembled one lane at
   a time with __msa_insert_w.
   NOTE(review): "*((int *) x)" reads a FLOAT object through an int pointer
   to obtain its bit pattern - confirm the build tolerates this type punning
   (e.g. strict aliasing disabled). */
else if (1 == inc_y)
{
da_vec = COPY_FLOAT_TO_VECTOR(da);
/* Main loop: 16 elements (4 gathered vectors) per iteration. */
for (i = (n >> 4); i--;)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x));
x += inc_x;
x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
x += inc_x;
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x));
x += inc_x;
LD_SP4_INC(py, 4, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
ST_SP4_INC(y0, y1, y2, y3, y, 4);
}
/* Remainder: low four bits of n, largest chunk first. */
if (n & 15)
{
if (n & 8)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
x += inc_x;
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
x += inc_x;
LD_SP2_INC(py, 4, y0, y1);
FMADD2(x0, x1, da_vec, y0, y1);
ST_SP2_INC(y0, y1, y, 4);
}
if (n & 4)
{
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
x += inc_x;
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
x += inc_x;
y0 = LD_SP(py); py += 4;
y0 += da_vec * x0;
ST_SP(y0, y); y += 4;
}
/* Scalar tail, reading and writing y directly. */
if (n & 2)
{
FMADD2(x[0], x[inc_x], da, y[0], y[1]);
x += 2 * inc_x;
y += 2;
}
if (n & 1)
{
y[0] += da * x[0];
}
}
}
/* Path 3: general strides, scalar code, 4 elements per iteration. */
else
{
/* These scalars shadow the vector variables of the same names declared
   at function scope. */
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
for (i = (n >> 2); i--;)
{
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
}
if (n & 3)
{
if (n & 2)
{
LD_GP2_INC(x, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
FMADD2(x0, x1, da, y0, y1);
ST_GP2_INC(y0, y1, y, inc_y);
}
if (n & 1)
{
*y += da * *x;
}
}
}
return (0);
}

186
kernel/mips/scopy_msa.c Normal file
View File

@ -0,0 +1,186 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Copy kernel: copies n elements from x (stride inc_x) to y (stride inc_y).
 *
 * The contiguous path moves data in v4f32 vectors and advances the pointers
 * by 4 per vector, so FLOAT is presumably float in this build (TODO confirm
 * against common.h).
 *
 * A negative n is a no-op.  Always returns 0.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
if (n < 0) return (0);
/* Contiguous case: both strides are 1, so whole vectors can be moved. */
if ((1 == inc_x) && (1 == inc_y))
{
/* Bulk path: software-pipelined, 64 elements (16 vectors) per step. */
if (n > 63)
{
FLOAT *x_pref;
BLASLONG pref_offset;
/* Number of elements from x up to the next L1_DATA_LINESIZE boundary
   (stays 0 when x is already on a boundary). */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
/* Prefetch cursor starts 160 elements beyond that boundary. */
x_pref = x + pref_offset + 128 + 32;
/* Prime the pipeline: the first 8 vectors go into x0..x7.  The pointer
   accounting below relies on LD_SP8_INC advancing x by 4 per vector,
   like the explicit LD_SP / "x += 4" pairs. */
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
/* One iteration fewer than n / 64: the last group is drained after the
   loop so that no load runs past the bulk region. */
for (i = (n >> 6) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
/* Keep the prefetch cursor in step with the 64 elements consumed. */
x_pref += 64;
/* Loads into x8..x15 are interleaved with stores of x0..x7 ... */
x8 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x7, y); y += 4;
/* ... then x0..x7 are refilled while x8..x15 are stored. */
x0 = LD_SP(x); x += 4;
ST_SP(x8, y); y += 4;
x1 = LD_SP(x); x += 4;
ST_SP(x9, y); y += 4;
x2 = LD_SP(x); x += 4;
ST_SP(x10, y); y += 4;
x3 = LD_SP(x); x += 4;
ST_SP(x11, y); y += 4;
x4 = LD_SP(x); x += 4;
ST_SP(x12, y); y += 4;
x5 = LD_SP(x); x += 4;
ST_SP(x13, y); y += 4;
x6 = LD_SP(x); x += 4;
ST_SP(x14, y); y += 4;
x7 = LD_SP(x); x += 4;
ST_SP(x15, y); y += 4;
}
/* Drain: load the final 8 vectors and store all 16 still held in
   registers. */
x8 = LD_SP(x); x += 4;
x9 = LD_SP(x); x += 4;
ST_SP(x0, y); y += 4;
x10 = LD_SP(x); x += 4;
ST_SP(x1, y); y += 4;
x11 = LD_SP(x); x += 4;
ST_SP(x2, y); y += 4;
x12 = LD_SP(x); x += 4;
ST_SP(x3, y); y += 4;
x13 = LD_SP(x); x += 4;
ST_SP(x4, y); y += 4;
x14 = LD_SP(x); x += 4;
ST_SP(x5, y); y += 4;
x15 = LD_SP(x); x += 4;
ST_SP(x6, y); y += 4;
ST_SP(x7, y); y += 4;
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
}
/* Remainder: the low six bits of n, largest chunk first. */
if (n & 63)
{
/* 32 elements = 8 vectors. */
if (n & 32)
{
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
}
/* 16 elements = 4 vectors. */
if (n & 16)
{
LD_SP4_INC(x, 4, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, y, 4);
}
/* 8 elements = 2 vectors. */
if (n & 8)
{
LD_SP2_INC(x, 4, x0, x1);
ST_SP2_INC(x0, x1, y, 4);
}
/* 4, 2 and 1 leftover elements are copied as scalars. */
if (n & 4)
{
LD_GP4_INC(x, 1, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, 1);
}
if (n & 2)
{
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
if (n & 1)
{
*y = *x;
}
}
}
/* Strided case: scalar copies, 8 at a time, then 4 / 2 / 1. */
else
{
for (i = (n >> 3); i--;)
{
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
}
if (n & 4)
{
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
}
if (n & 2)
{
LD_GP2_INC(x, inc_x, f0, f1);
ST_GP2_INC(f0, f1, y, inc_y);
}
if (n & 1)
{
*y = *x;
}
}
return (0);
}

385
kernel/mips/sscal_msa.c Normal file
View File

@ -0,0 +1,385 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * SCAL kernel for MIPS MSA: scales n elements of x in place by da.
 *
 *   n      number of elements to process; n <= 0 is a no-op
 *   da     scale factor; when it compares equal to 0.0 the elements are
 *          overwritten with zero without being read
 *   x      vector that is updated in place
 *   inc_x  distance between consecutive elements of x, in FLOAT units
 *
 * dummy0, dummy1, y, inc_y, dummy and dummy2 are not referenced here.
 * Always returns 0.
 *
 * Two cursors walk the same vector: px is the read cursor and x is the
 * write cursor, which lets loads run ahead of the stores.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i;
FLOAT *px;
FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
v4f32 da_vec;
/* Every trip count and tail test below is derived from n with shifts and
   masks. For a negative n the "i--" loops would not terminate sensibly and
   the (n & mask) tails would still fire, so bail out before touching x. */
if (n <= 0) return 0;
px = x;
if (1 == inc_x)
{
/* ---- contiguous vector ---- */
if (0.0 == da)
{
/* Zero fill: build an all-zero vector once and only store. */
v4f32 zero_v = __msa_cast_to_vector_float(0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
/* 64 elements per pass: two groups of 8 vectors, 4 floats each. */
for (i = (n >> 6); i--;)
{
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
}
/* Remaining 0..63 elements, one binary digit of n at a time. */
if (n & 63)
{
if (n & 32)
{
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
zero_v, zero_v, x, 4);
}
if (n & 16)
{
ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
}
if (n & 8)
{
ST_SP2_INC(zero_v, zero_v, x, 4);
}
if (n & 4)
{
*x = 0; x += 1;
*x = 0; x += 1;
*x = 0; x += 1;
*x = 0; x += 1;
}
if (n & 2)
{
*x = 0; x += 1;
*x = 0; x += 1;
}
if (n & 1)
{
*x = 0;
}
}
}
else
{
/* presumably broadcasts da into every lane -- see macros_msa.h */
da_vec = COPY_FLOAT_TO_VECTOR(da);
if (n > 63)
{
FLOAT *x_pref;
BLASLONG pref_offset;
/* Number of FLOATs from x up to the next L1_DATA_LINESIZE boundary
   (0 when x already sits on one); the mask assumes the line size is a
   power of two. */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
/* Prefetch cursor starts 96 elements past that boundary. */
x_pref = x + pref_offset + 64 + 32;
/* Prologue: preload the first 8 vectors (32 elements). */
LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
/* Software-pipelined main loop, 64 elements per pass: loads of x8..x15
   are interleaved with the multiplies of x0..x7, the stores of x0..x7 with
   the multiplies of x8..x15, and the stores of x8..x15 with the preload of
   the next x0..x7. The last 64-element block is handled after the loop so
   nothing is loaded beyond it. */
for (i = 0; i < (n >> 6) - 1; i++)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 64;
x8 = LD_SP(px); px += 4;
x0 *= da_vec;
x9 = LD_SP(px); px += 4;
x1 *= da_vec;
x10 = LD_SP(px); px += 4;
x2 *= da_vec;
x11 = LD_SP(px); px += 4;
x3 *= da_vec;
x12 = LD_SP(px); px += 4;
x4 *= da_vec;
x13 = LD_SP(px); px += 4;
x5 *= da_vec;
x14 = LD_SP(px); px += 4;
x6 *= da_vec;
x15 = LD_SP(px); px += 4;
x7 *= da_vec;
x8 *= da_vec;
ST_SP(x0, x); x += 4;
x9 *= da_vec;
ST_SP(x1, x); x += 4;
x10 *= da_vec;
ST_SP(x2, x); x += 4;
x11 *= da_vec;
ST_SP(x3, x); x += 4;
x12 *= da_vec;
ST_SP(x4, x); x += 4;
x13 *= da_vec;
ST_SP(x5, x); x += 4;
x14 *= da_vec;
ST_SP(x6, x); x += 4;
x15 *= da_vec;
ST_SP(x7, x); x += 4;
ST_SP(x8, x); x += 4;
x0 = LD_SP(px); px += 4;
ST_SP(x9, x); x += 4;
x1 = LD_SP(px); px += 4;
ST_SP(x10, x); x += 4;
x2 = LD_SP(px); px += 4;
ST_SP(x11, x); x += 4;
x3 = LD_SP(px); px += 4;
ST_SP(x12, x); x += 4;
x4 = LD_SP(px); px += 4;
ST_SP(x13, x); x += 4;
x5 = LD_SP(px); px += 4;
ST_SP(x14, x); x += 4;
x6 = LD_SP(px); px += 4;
ST_SP(x15, x); x += 4;
x7 = LD_SP(px); px += 4;
}
/* Epilogue: finish the final 64-element block without preloading. */
x8 = LD_SP(px); px += 4;
x0 *= da_vec;
x9 = LD_SP(px); px += 4;
x1 *= da_vec;
x10 = LD_SP(px); px += 4;
x2 *= da_vec;
x11 = LD_SP(px); px += 4;
x3 *= da_vec;
x12 = LD_SP(px); px += 4;
x4 *= da_vec;
x13 = LD_SP(px); px += 4;
x5 *= da_vec;
x14 = LD_SP(px); px += 4;
x6 *= da_vec;
x15 = LD_SP(px); px += 4;
x7 *= da_vec;
x8 *= da_vec;
ST_SP(x0, x); x += 4;
x9 *= da_vec;
ST_SP(x1, x); x += 4;
x10 *= da_vec;
ST_SP(x2, x); x += 4;
x11 *= da_vec;
ST_SP(x3, x); x += 4;
x12 *= da_vec;
ST_SP(x4, x); x += 4;
x13 *= da_vec;
ST_SP(x5, x); x += 4;
x15 *= da_vec;
ST_SP(x6, x); x += 4;
x14 *= da_vec;
ST_SP(x7, x); x += 4;
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
}
/* Remaining 0..63 elements: vector code down to 8, scalar code below. */
if (n & 63)
{
if (n & 32)
{
LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
}
if (n & 16)
{
LD_SP4_INC(px, 4, x0, x1, x2, x3);
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
ST_SP4_INC(x0, x1, x2, x3, x, 4);
}
if (n & 8)
{
LD_SP2_INC(px, 4, x0, x1);
MUL2(x0, da_vec, x1, da_vec, x0, x1);
ST_SP2_INC(x0, x1, x, 4);
}
if (n & 4)
{
LD_GP4_INC(px, 1, f0, f1, f2, f3);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, x, 1);
}
if (n & 2)
{
LD_GP2_INC(px, 1, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, 1);
}
if (n & 1)
{
*x *= da;
}
}
}
}
else
{
/* ---- strided vector: scalar code only ---- */
if (0.0 == da)
{
for (i = n; i--;)
{
*x = 0;
x += inc_x;
}
}
else
{
if (n > 15)
{
/* Same pipelining scheme as the contiguous path, on 16 scalars per pass:
   f0..f7 are always loaded one block ahead of their stores. */
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
for (i = 0; i < (n >> 4) - 1; i++)
{
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
f0 = *px; px += inc_x;
*x = f12; x += inc_x;
f1 = *px; px += inc_x;
*x = f13; x += inc_x;
f2 = *px; px += inc_x;
*x = f14; x += inc_x;
f3 = *px; px += inc_x;
*x = f15; x += inc_x;
f4 = *px; px += inc_x;
f5 = *px; px += inc_x;
f6 = *px; px += inc_x;
f7 = *px; px += inc_x;
}
/* Epilogue for the final 16-element block: no preload of a next block. */
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
f4 *= da;
f5 *= da;
*x = f0; x += inc_x;
f6 *= da;
*x = f1; x += inc_x;
f7 *= da;
*x = f2; x += inc_x;
f8 *= da;
*x = f3; x += inc_x;
f9 *= da;
*x = f4; x += inc_x;
f10 *= da;
*x = f5; x += inc_x;
f11 *= da;
*x = f6; x += inc_x;
f12 *= da;
*x = f7; x += inc_x;
f13 *= da;
*x = f8; x += inc_x;
f14 *= da;
*x = f9; x += inc_x;
f15 *= da;
*x = f10; x += inc_x;
*x = f11; x += inc_x;
*x = f12; x += inc_x;
*x = f13; x += inc_x;
*x = f14; x += inc_x;
*x = f15; x += inc_x;
}
/* Remaining 0..15 elements, one binary digit of n at a time. */
if (n & 15)
{
if (n & 8)
{
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
}
if (n & 4)
{
LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
}
if (n & 2)
{
LD_GP2_INC(px, inc_x, f0, f1);
MUL2(f0, da, f1, da, f0, f1);
ST_GP2_INC(f0, f1, x, inc_x);
}
if (n & 1)
{
*x *= da;
}
}
}
}
return 0;
}

267
kernel/mips/sswap_msa.c Normal file
View File

@ -0,0 +1,267 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * SWAP kernel for MIPS MSA: exchanges n elements between srcx and srcy.
 *
 *   n      number of elements to exchange; returns immediately when n < 0
 *   srcx   first vector, inc_x is its element stride in FLOAT units
 *   srcy   second vector, inc_y is its element stride in FLOAT units
 *
 * dummy0, dummy1, dummy3, dummy and dummy2 are not referenced here.
 * Always returns 0.
 *
 * px/py are the read cursors; srcx/srcy are reused as the write cursors.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i = 0, pref_offsetx, pref_offsety;
FLOAT *px, *py;
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
if (n < 0) return (0);
/* Number of FLOATs from srcx up to the next L1_DATA_LINESIZE boundary
   (0 when already on one); the mask assumes a power-of-two line size. */
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
if (pref_offsetx > 0)
{
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
pref_offsetx = pref_offsetx / sizeof(FLOAT);
}
/* Same computation for srcy. */
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
if (pref_offsety > 0)
{
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
pref_offsety = pref_offsety / sizeof(FLOAT);
}
px = srcx;
py = srcy;
if ((1 == inc_x) && (1 == inc_y))
{
/* ---- both vectors contiguous: 32 elements (8 vectors) per block ---- */
if (n >> 5)
{
/* Prologue: the x half of the first block is loaded ahead of the loop. */
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
/* Each pass loads the current y block while storing the already loaded
   x block into y, then loads the next x block while storing the y block
   into x. The final block is completed after the loop. */
for (i = (n >> 5) - 1; i--;)
{
PREFETCH(px + pref_offsetx + 32);
PREFETCH(px + pref_offsetx + 40);
PREFETCH(px + pref_offsetx + 48);
PREFETCH(px + pref_offsetx + 56);
PREFETCH(py + pref_offsety + 32);
PREFETCH(py + pref_offsety + 40);
PREFETCH(py + pref_offsety + 48);
PREFETCH(py + pref_offsety + 56);
yv0 = LD_SP(py); py += 4;
ST_SP(xv0, srcy); srcy += 4;
yv1 = LD_SP(py); py += 4;
ST_SP(xv1, srcy); srcy += 4;
yv2 = LD_SP(py); py += 4;
ST_SP(xv2, srcy); srcy += 4;
yv3 = LD_SP(py); py += 4;
ST_SP(xv3, srcy); srcy += 4;
yv4 = LD_SP(py); py += 4;
ST_SP(xv4, srcy); srcy += 4;
yv5 = LD_SP(py); py += 4;
ST_SP(xv5, srcy); srcy += 4;
yv6 = LD_SP(py); py += 4;
ST_SP(xv6, srcy); srcy += 4;
yv7 = LD_SP(py); py += 4;
ST_SP(xv7, srcy); srcy += 4;
xv0 = LD_SP(px); px += 4;
ST_SP(yv0, srcx); srcx += 4;
xv1 = LD_SP(px); px += 4;
ST_SP(yv1, srcx); srcx += 4;
xv2 = LD_SP(px); px += 4;
ST_SP(yv2, srcx); srcx += 4;
xv3 = LD_SP(px); px += 4;
ST_SP(yv3, srcx); srcx += 4;
xv4 = LD_SP(px); px += 4;
ST_SP(yv4, srcx); srcx += 4;
xv5 = LD_SP(px); px += 4;
ST_SP(yv5, srcx); srcx += 4;
xv6 = LD_SP(px); px += 4;
ST_SP(yv6, srcx); srcx += 4;
xv7 = LD_SP(px); px += 4;
ST_SP(yv7, srcx); srcx += 4;
}
/* Epilogue: exchange the last 32-element block. */
LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
}
if (n & 31)
{
/* Vector part of the tail: the 16/8/4 bits of n are decoded together so
   that exactly one branch moves all 7, 6, 5, 3, 4, 2 or 1 vectors. */
if ((n & 16) && (n & 8) && (n & 4))
{
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
}
else if ((n & 16) && (n & 8))
{
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
}
else if ((n & 16) && (n & 4))
{
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
}
else if ((n & 8) && (n & 4))
{
LD_SP3_INC(px, 4, xv0, xv1, xv2);
LD_SP3_INC(py, 4, yv0, yv1, yv2);
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
}
else if (n & 16)
{
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
}
else if (n & 8)
{
LD_SP2_INC(px, 4, xv0, xv1);
LD_SP2_INC(py, 4, yv0, yv1);
ST_SP2_INC(xv0, xv1, srcy, 4);
ST_SP2_INC(yv0, yv1, srcx, 4);
}
else if (n & 4)
{
xv0 = LD_SP(px);
yv0 = LD_SP(py);
px += 4;
py += 4;
ST_SP(xv0, srcy);
ST_SP(yv0, srcx);
srcx += 4;
srcy += 4;
}
/* Scalar part of the tail: last 3, 2 or 1 elements. */
if ((n & 2) && (n & 1))
{
/* x3/y3 serve as the third temporary here; x2/y2 are simply not used. */
LD_GP3_INC(px, 1, x0, x1, x3);
LD_GP3_INC(py, 1, y0, y1, y3);
ST_GP3_INC(x0, x1, x3, srcy, 1);
ST_GP3_INC(y0, y1, y3, srcx, 1);
}
else if (n & 2)
{
LD_GP2_INC(px, 1, x0, x1);
LD_GP2_INC(py, 1, y0, y1);
ST_GP2_INC(x0, x1, srcy, 1);
ST_GP2_INC(y0, y1, srcx, 1);
}
else if (n & 1)
{
x0 = px[0];
y0 = py[0];
srcx[0] = y0;
srcy[0] = x0;
}
}
}
else
{
/* ---- at least one vector is strided: scalar code, 8 elements per pass.
   The prefetch offsets computed above are not used on this path. ---- */
for (i = (n >> 3); i--;)
{
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
}
if (n & 7)
{
/* The low three bits of n are decoded together: exactly one branch
   exchanges the remaining 7, 6, 5, 3, 4, 2 or 1 elements. */
if ((n & 4) && (n & 2) && (n & 1))
{
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
}
else if ((n & 4) && (n & 2))
{
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
}
else if ((n & 4) && (n & 1))
{
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
}
else if ((n & 2) && (n & 1))
{
LD_GP3_INC(px, inc_x, x0, x1, x2);
LD_GP3_INC(py, inc_y, y0, y1, y2);
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
}
else if (n & 4)
{
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
}
else if (n & 2)
{
LD_GP2_INC(px, inc_x, x0, x1);
LD_GP2_INC(py, inc_y, y0, y1);
ST_GP2_INC(x0, x1, srcy, inc_y);
ST_GP2_INC(y0, y1, srcx, inc_x);
}
else if (n & 1)
{
x0 = *srcx;
y0 = *srcy;
*srcx = y0;
*srcy = x0;
}
}
}
return (0);
}

494
kernel/mips/zaxpy_msa.c Normal file
View File

@ -0,0 +1,494 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Compound-assignment operators for the three sign-dependent terms of the
 * complex AXPY update (the da_r * x_re term is always added to y_re):
 *
 *   y_im OP0 da_r * x_im
 *   y_re OP1 da_i * x_im
 *   y_im OP2 da_i * x_re
 *
 * Without CONJ the kernel computes y += da * x:
 *   y_re += da_r*x_re - da_i*x_im,   y_im += da_r*x_im + da_i*x_re
 * With CONJ it computes y += da * conj(x):
 *   y_re += da_r*x_re + da_i*x_im,   y_im += da_i*x_re - da_r*x_im
 *
 * Conjugating x only flips the sign of x_im, so just the two x_im terms
 * (OP0, OP1) change sign; the da_i * x_re term (OP2) is added in both
 * variants.
 */
#if !defined(CONJ)
#define OP0 +=
#define OP1 -=
#define OP2 +=
#else
#define OP0 -=
#define OP1 +=
#define OP2 +=
#endif
/*
 * Complex AXPY kernel for MIPS MSA: updates y with the product of the
 * complex scalar (da_r, da_i) and x, for n complex elements.
 *
 * Per element, with the signs chosen by the OP0/OP1/OP2 macros defined
 * above this function (they depend on CONJ):
 *   y_re +=  da_r * x_re
 *   y_im OP0 da_r * x_im
 *   y_re OP1 da_i * x_im
 *   y_im OP2 da_i * x_re
 *
 *   n            number of complex elements; returns at once when n < 0
 *   da_r, da_i   real and imaginary part of the scalar; when both compare
 *                equal to 0.0 the function returns without touching y
 *   x, inc_x     source vector and its stride in complex elements
 *   y, inc_y     vector updated in place and its stride in complex elements
 *
 * dummy0, dummy1, dummy and dummy2 are not referenced here.
 * Always returns 0.
 *
 * py is the read cursor for y and y itself is the write cursor. The four
 * branches differ only in which of the two vectors is contiguous (stride 2
 * FLOATs per element plus prefetching) or strided (2 * inc FLOATs).
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
BLASLONG dummy2)
{
BLASLONG i, inc_x2, inc_y2;
FLOAT *py;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
FLOAT xd0, xd1, yd0, yd1;
if (n < 0) return(0);
if ((da_r == 0.0) && (da_i == 0.0)) return(0);
py = y;
/* presumably broadcast the scalar parts into both lanes -- see macros_msa.h */
dar_vec = COPY_DOUBLE_TO_VECTOR(da_r);
dai_vec = COPY_DOUBLE_TO_VECTOR(da_i);
if ((1 == inc_x) && (1 == inc_y))
{
/* ---- x and y both contiguous ---- */
FLOAT *x_pref, *y_pref;
BLASLONG pref_offset;
/* FLOATs from x up to the next L1_DATA_LINESIZE boundary (0 if already on
   one); the mask assumes a power-of-two line size. */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;
/* Same computation for y. */
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;
/* Main loop: 8 complex elements (16 FLOATs) of each vector per pass. */
for (i = (n >> 3); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
x_pref += 16;
y_pref += 16;
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
/* presumably splits two interleaved (re, im) vectors into one vector of
   real parts and one of imaginary parts -- confirm in macros_msa.h */
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
/* presumably yNr += xNr * dar_vec, matching the explicit form used in the
   2-element tail below */
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
/* presumably re-interleaves real and imaginary parts for the store */
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}
/* Remaining 0..7 elements, one binary digit of n at a time. */
if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}
if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}
if (n & 1)
{
/* Last element in scalar code: xd0/yd0 real part, xd1/yd1 imaginary. */
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else if (1 == inc_y)
{
/* ---- y contiguous, x strided: only y is prefetched ---- */
FLOAT *y_pref;
BLASLONG pref_offset;
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
y_pref = y + pref_offset + 32;
/* Stride of x expressed in FLOATs (two per complex element). */
inc_x2 = 2 * inc_x;
for (i = (n >> 3); i--;)
{
PREF_OFFSET(y_pref, 0);
PREF_OFFSET(y_pref, 32);
PREF_OFFSET(y_pref, 64);
PREF_OFFSET(y_pref, 96);
y_pref += 16;
LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
}
if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
LD_DP4_INC(py, 2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, 2);
}
if (n & 2)
{
LD_DP2_INC(x, inc_x2, x0, x1);
LD_DP2_INC(py, 2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, 2);
}
if (n & 1)
{
/* Step 1 is correct even for strided x: the real and imaginary parts of
   one element are adjacent, and nothing is read after this element. */
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else if (1 == inc_x)
{
/* ---- x contiguous, y strided: only x is prefetched ---- */
FLOAT *x_pref;
BLASLONG pref_offset;
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
x_pref = x + pref_offset + 32;
/* Stride of y expressed in FLOATs (two per complex element). */
inc_y2 = 2 * inc_y;
for (i = (n >> 3); i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
x_pref += 16;
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
}
if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
}
if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
LD_DP2_INC(py, inc_y2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, inc_y2);
}
if (n & 1)
{
/* Step 1 addresses the two adjacent parts of the final element. */
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
else
{
/* ---- x and y both strided: same arithmetic, no prefetching ---- */
inc_x2 = 2 * inc_x;
inc_y2 = 2 * inc_y;
for (i = (n >> 3); i--;)
{
LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y2i OP0 dar_vec * x2i;
y3i OP0 dar_vec * x3i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y2r OP1 dai_vec * x2i;
y3r OP1 dai_vec * x3i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
y2i OP2 dai_vec * x2r;
y3i OP2 dai_vec * x3r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ILVRL_D2_DP(y2i, y2r, y4, y5);
ILVRL_D2_DP(y3i, y3r, y6, y7);
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
}
if (n & 7)
{
if (n & 4)
{
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
y0i OP0 dar_vec * x0i;
y1i OP0 dar_vec * x1i;
y0r OP1 dai_vec * x0i;
y1r OP1 dai_vec * x1i;
y0i OP2 dai_vec * x0r;
y1i OP2 dai_vec * x1r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ILVRL_D2_DP(y1i, y1r, y2, y3);
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
}
if (n & 2)
{
LD_DP2_INC(x, inc_x2, x0, x1);
LD_DP2_INC(py, inc_y2, y0, y1);
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
y0r += dar_vec * x0r;
y0i OP0 dar_vec * x0i;
y0r OP1 dai_vec * x0i;
y0i OP2 dai_vec * x0r;
ILVRL_D2_DP(y0i, y0r, y0, y1);
ST_DP2_INC(y0, y1, y, inc_y2);
}
if (n & 1)
{
/* Step 1 addresses the two adjacent parts of the final element. */
LD_GP2_INC(x, 1, xd0, xd1);
LD_GP2_INC(py, 1, yd0, yd1);
yd0 += da_r * xd0;
yd1 OP0 da_r * xd1;
yd0 OP1 da_i * xd1;
yd1 OP2 da_i * xd0;
ST_GP2_INC(yd0, yd1, y, 1);
}
}
}
return (0);
}

218
kernel/mips/zcopy_msa.c Normal file
View File

@ -0,0 +1,218 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/*
 * Complex COPY kernel for MIPS MSA: copies n complex elements from x to y.
 *
 *   n          number of complex elements; returns immediately when n < 0
 *   x, inc_x   source vector and its stride in complex elements
 *   y, inc_y   destination vector and its stride in complex elements
 *
 * Always returns 0.
 *
 * One v2f64 vector holds one complex element (two FLOATs), so the
 * contiguous path advances both pointers by 2 FLOATs per vector.
 */
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
{
BLASLONG i;
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
FLOAT f0, f1;
if (n < 0) return (0);
if ((1 == inc_x) && (1 == inc_y))
{
/* ---- both vectors contiguous ---- */
if (n > 15)
{
FLOAT *x_pref;
BLASLONG pref_offset;
/* FLOATs from x up to the next L1_DATA_LINESIZE boundary (0 if already on
   one); the mask assumes a power-of-two line size. */
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
if (pref_offset > 0)
{
pref_offset = L1_DATA_LINESIZE - pref_offset;
pref_offset = pref_offset / sizeof(FLOAT);
}
/* Prefetch cursor starts 80 FLOATs past that boundary. */
x_pref = x + pref_offset + 64 + 16;
/* Prologue: preload the first 8 elements. */
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
/* Software-pipelined main loop, 16 elements per pass: loads of x8..x15 are
   interleaved with the stores of x0..x7, and the preload of the next
   x0..x7 with the stores of x8..x15. The last 16-element block is finished
   after the loop so nothing is loaded beyond it. */
for (i = (n >> 4) - 1; i--;)
{
PREF_OFFSET(x_pref, 0);
PREF_OFFSET(x_pref, 32);
PREF_OFFSET(x_pref, 64);
PREF_OFFSET(x_pref, 96);
PREF_OFFSET(x_pref, 128);
PREF_OFFSET(x_pref, 160);
PREF_OFFSET(x_pref, 192);
PREF_OFFSET(x_pref, 224);
x_pref += 32;
x8 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x7, y); y += 2;
x0 = LD_DP(x); x += 2;
ST_DP(x8, y); y += 2;
x1 = LD_DP(x); x += 2;
ST_DP(x9, y); y += 2;
x2 = LD_DP(x); x += 2;
ST_DP(x10, y); y += 2;
x3 = LD_DP(x); x += 2;
ST_DP(x11, y); y += 2;
x4 = LD_DP(x); x += 2;
ST_DP(x12, y); y += 2;
x5 = LD_DP(x); x += 2;
ST_DP(x13, y); y += 2;
x6 = LD_DP(x); x += 2;
ST_DP(x14, y); y += 2;
x7 = LD_DP(x); x += 2;
ST_DP(x15, y); y += 2;
}
/* Epilogue: load the second half of the final block and store all 16. */
x8 = LD_DP(x); x += 2;
x9 = LD_DP(x); x += 2;
ST_DP(x0, y); y += 2;
x10 = LD_DP(x); x += 2;
ST_DP(x1, y); y += 2;
x11 = LD_DP(x); x += 2;
ST_DP(x2, y); y += 2;
x12 = LD_DP(x); x += 2;
ST_DP(x3, y); y += 2;
x13 = LD_DP(x); x += 2;
ST_DP(x4, y); y += 2;
x14 = LD_DP(x); x += 2;
ST_DP(x5, y); y += 2;
x15 = LD_DP(x); x += 2;
ST_DP(x6, y); y += 2;
ST_DP(x7, y); y += 2;
ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
}
/* Remaining 0..15 elements, one binary digit of n at a time. */
if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
}
if (n & 4)
{
LD_DP4_INC(x, 2, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, 2);
}
if (n & 2)
{
LD_DP2_INC(x, 2, x0, x1);
ST_DP2_INC(x0, x1, y, 2);
}
if (n & 1)
{
/* Last element as two scalars: f0 real part, f1 imaginary part. */
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}
else
{
/* ---- at least one vector is strided ---- */
/* Convert the strides from complex elements to FLOATs. */
inc_x *= 2;
inc_y *= 2;
/* 16 elements per pass; the first 8 loads run ahead of the stores, but
   unlike the contiguous path nothing is carried across iterations. */
for (i = (n >> 4); i--;)
{
x0 = LD_DP(x); x += inc_x;
x1 = LD_DP(x); x += inc_x;
x2 = LD_DP(x); x += inc_x;
x3 = LD_DP(x); x += inc_x;
x4 = LD_DP(x); x += inc_x;
x5 = LD_DP(x); x += inc_x;
x6 = LD_DP(x); x += inc_x;
x7 = LD_DP(x); x += inc_x;
x8 = LD_DP(x); x += inc_x;
ST_DP(x0, y); y += inc_y;
x9 = LD_DP(x); x += inc_x;
ST_DP(x1, y); y += inc_y;
x10 = LD_DP(x); x += inc_x;
ST_DP(x2, y); y += inc_y;
x11 = LD_DP(x); x += inc_x;
ST_DP(x3, y); y += inc_y;
x12 = LD_DP(x); x += inc_x;
ST_DP(x4, y); y += inc_y;
x13 = LD_DP(x); x += inc_x;
ST_DP(x5, y); y += inc_y;
x14 = LD_DP(x); x += inc_x;
ST_DP(x6, y); y += inc_y;
x15 = LD_DP(x); x += inc_x;
ST_DP(x7, y); y += inc_y;
ST_DP(x8, y); y += inc_y;
ST_DP(x9, y); y += inc_y;
ST_DP(x10, y); y += inc_y;
ST_DP(x11, y); y += inc_y;
ST_DP(x12, y); y += inc_y;
ST_DP(x13, y); y += inc_y;
ST_DP(x14, y); y += inc_y;
ST_DP(x15, y); y += inc_y;
}
/* Remaining 0..15 elements, one binary digit of n at a time. */
if (n & 15)
{
if (n & 8)
{
LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y);
}
if (n & 4)
{
LD_DP4_INC(x, inc_x, x0, x1, x2, x3);
ST_DP4_INC(x0, x1, x2, x3, y, inc_y);
}
if (n & 2)
{
LD_DP2_INC(x, inc_x, x0, x1);
ST_DP2_INC(x0, x1, y, inc_y);
}
if (n & 1)
{
/* Step 1 is intentional: the real and imaginary parts of one element are
   adjacent regardless of the vector stride. */
LD_GP2_INC(x, 1, f0, f1);
ST_GP2_INC(f0, f1, y, 1);
}
}
}
return (0);
}

717
kernel/mips/zscal_msa.c Normal file
View File

@ -0,0 +1,717 @@
/*******************************************************************************
Copyright (c) 2017, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Immediate operand for __msa_shf_w.  78 == 0b01001110, i.e. the four 2-bit
   selectors 01 00 11 10 (mask needed :: 01 00 11 10).
   This will shuffle the 32-bit words of the 'in' vector as
   0 1 2 3 => 2 3 0 1
   so, applied to a v2f64 viewed as v4i32, the two 64-bit halves trade
   places. */
#define SHF_78 78
/* Complex in-place scale: x[k] = (da_r + i*da_i) * x[k] for k = 0 .. n-1.
 *
 * n      number of complex elements; n <= 0 is a no-op.
 * da_r   real part of the scalar.
 * da_i   imaginary part of the scalar.
 * x      interleaved (real, imag) pairs, read and overwritten in place.
 * inc_x  stride between consecutive complex elements, counted in complex
 *        elements (the code steps the pointer by 2 * inc_x FLOATs).
 * dummy0, dummy1, y, inc_y, dummy, dummy2 are not used by this kernel.
 *
 * Returns 0 always.
 *
 * 'px' is the read cursor and 'x' the write cursor over the same buffer;
 * both advance by the same amount overall, with reads running ahead.
 * Four scalar cases are specialised for both the unit-stride and the strided
 * path: both parts zero (store zeros), da_r == 0, da_i == 0, and general.
 * The statement order inside the unit-stride loops is hand-interleaved
 * (loads, multiplies, shuffles and stores of different vectors are mixed);
 * it is kept as is on purpose.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
          FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
          BLASLONG dummy2)
{
    BLASLONG i, inc_x2;
    FLOAT *px;
    FLOAT tp0, tp1, f0, f1;
    v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
    v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
    v2f64 da_i_vec, da_i_vec_neg, da_r_vec;

    /* Nothing to scale.  Without this guard a negative n would make the
       "for (i = (n >> 4); i--;)" loops below start from a negative count and
       keep storing far outside the vector. */
    if (n <= 0) return (0);

    px = x;

    if (1 == inc_x)
    {
        /* ---- contiguous vector: one v2f64 per complex element, step 2 ---- */
        if ((0.0 == da_r) && (0.0 == da_i))
        {
            /* Scalar is exactly zero: overwrite with zeros, no loads. */
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            /* 16 complex elements per iteration (two groups of 8 stores). */
            for (i = (n >> 4); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, 2);
            }

            /* Remaining 0..15 elements, peeled by the bits of n. */
            if (n & 15)
            {
                if (n & 8)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, 2);
                }

                if (n & 4)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
                }

                if (n & 2)
                {
                    ST_DP2_INC(zero_v, zero_v, x, 2);
                }

                if (n & 1)
                {
                    ST_DP(zero_v, x);
                }
            }
        }
        else if (0.0 == da_r)
        {
            /* Purely imaginary scalar: result is (-da_i * im, da_i * re).
               da_i_vec is built so that the element-wise product followed by
               the SHF_78 half swap yields that pair; the scalar tail below
               does the same with da_i on the real part and -da_i on the
               imaginary part, then stores them swapped. */
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            if (n > 15)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                /* Distance (in FLOATs) from x up to the next cache-line
                   boundary; 0 when x is already aligned. */
                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 32 + 16;

                /* Prologue: first 8 vectors of the first 16-element block. */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);

                /* Steady state: each iteration finishes one block of 16
                   elements and pre-loads the first 8 of the next block. */
                for (i = (n >> 4)- 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    x8 = LD_DP(px); px += 2;
                    x0 *= da_i_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_i_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_i_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_i_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_i_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_i_vec;
                    x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_i_vec;
                    x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_i_vec;
                    x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
                    x8 *= da_i_vec;
                    x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
                    ST_DP(x0, x); x += 2;
                    x9 *= da_i_vec;
                    x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
                    ST_DP(x1, x); x += 2;
                    x10 *= da_i_vec;
                    x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
                    ST_DP(x2, x); x += 2;
                    x11 *= da_i_vec;
                    x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
                    ST_DP(x3, x); x += 2;
                    x12 *= da_i_vec;
                    x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
                    ST_DP(x4, x); x += 2;
                    x13 *= da_i_vec;
                    x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
                    ST_DP(x5, x); x += 2;
                    x14 *= da_i_vec;
                    x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
                    ST_DP(x6, x); x += 2;
                    x15 *= da_i_vec;
                    x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
                    ST_DP(x7, x); x += 2;
                    x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                /* Epilogue: second half of the last block, then process and
                   store all 16 vectors still held in registers. */
                LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     x12, x13, x14, x15);
                SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
                SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 2);
            }

            /* Remaining 0..15 elements. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         x4, x5, x6, x7);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
                    SHF_W2_DP(x0, x1, x0, x1, SHF_78);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 1)
                {
                    /* Scalar tail: f0 = re, f1 = im; stored swapped. */
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_i, f1, -da_i, f0, f1);
                    ST_GP2_INC(f1, f0, x, 1);
                }
            }
        }
        else if (0.0 == da_i)
        {
            /* Purely real scalar: both parts are simply multiplied by da_r. */
            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            if (n > 15)
            {
                FLOAT *x_pref;
                BLASLONG pref_offset;

                pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
                if (pref_offset > 0)
                {
                    pref_offset = L1_DATA_LINESIZE - pref_offset;
                    pref_offset = pref_offset / sizeof(FLOAT);
                }
                x_pref = x + pref_offset + 32 + 16;

                /* Same prologue / steady state / epilogue scheme as the
                   da_r == 0 case, without the shuffle. */
                LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);

                for (i = (n >> 4)- 1; i--;)
                {
                    PREF_OFFSET(x_pref, 0);
                    PREF_OFFSET(x_pref, 32);
                    PREF_OFFSET(x_pref, 64);
                    PREF_OFFSET(x_pref, 96);
                    PREF_OFFSET(x_pref, 128);
                    PREF_OFFSET(x_pref, 160);
                    PREF_OFFSET(x_pref, 192);
                    PREF_OFFSET(x_pref, 224);
                    x_pref += 32;

                    x8 = LD_DP(px); px += 2;
                    x0 *= da_r_vec;
                    x9 = LD_DP(px); px += 2;
                    x1 *= da_r_vec;
                    x10 = LD_DP(px); px += 2;
                    x2 *= da_r_vec;
                    x11 = LD_DP(px); px += 2;
                    x3 *= da_r_vec;
                    x12 = LD_DP(px); px += 2;
                    x4 *= da_r_vec;
                    x13 = LD_DP(px); px += 2;
                    x5 *= da_r_vec;
                    ST_DP(x0, x); x += 2;
                    x14 = LD_DP(px); px += 2;
                    x6 *= da_r_vec;
                    ST_DP(x1, x); x += 2;
                    x15 = LD_DP(px); px += 2;
                    x7 *= da_r_vec;
                    ST_DP(x2, x); x += 2;
                    x8 *= da_r_vec;
                    ST_DP(x3, x); x += 2;
                    x9 *= da_r_vec;
                    ST_DP(x4, x); x += 2;
                    x10 *= da_r_vec;
                    ST_DP(x5, x); x += 2;
                    x11 *= da_r_vec;
                    ST_DP(x6, x); x += 2;
                    x12 *= da_r_vec;
                    ST_DP(x7, x); x += 2;
                    x13 *= da_r_vec;
                    ST_DP(x8, x); x += 2;
                    x0 = LD_DP(px); px += 2;
                    x14 *= da_r_vec;
                    ST_DP(x9, x); x += 2;
                    x1 = LD_DP(px); px += 2;
                    x15 *= da_r_vec;
                    ST_DP(x10, x); x += 2;
                    x2 = LD_DP(px); px += 2;
                    ST_DP(x11, x); x += 2;
                    x3 = LD_DP(px); px += 2;
                    ST_DP(x12, x); x += 2;
                    x4 = LD_DP(px); px += 2;
                    ST_DP(x13, x); x += 2;
                    x5 = LD_DP(px); px += 2;
                    ST_DP(x14, x); x += 2;
                    x6 = LD_DP(px); px += 2;
                    ST_DP(x15, x); x += 2;
                    x7 = LD_DP(px); px += 2;
                }

                LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
                     x12, x13, x14, x15);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, 2);
            }

            /* Remaining 0..15 elements. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                         x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, 2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_r, f1, da_r, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }
            }
        }
        else
        {
            /* General scalar.  d = shuffle(x * da_i_vec) gives the da_i
               contribution (-da_i * im, da_i * re); FMADD then combines it
               with x and da_r_vec.  The scalar tail spells out the intended
               result: re' = da_r*re - da_i*im, im' = da_r*im + da_i*re. */
            FLOAT *x_pref;
            BLASLONG pref_offset;

            pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
            if (pref_offset > 0)
            {
                pref_offset = L1_DATA_LINESIZE - pref_offset;
                pref_offset = pref_offset / sizeof(FLOAT);
            }
            x_pref = x + pref_offset + 32;

            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            /* 16 complex elements per iteration, not software-pipelined. */
            for (i = (n >> 4); i--;)
            {
                PREF_OFFSET(x_pref, 0);
                PREF_OFFSET(x_pref, 32);
                PREF_OFFSET(x_pref, 64);
                PREF_OFFSET(x_pref, 96);
                PREF_OFFSET(x_pref, 128);
                PREF_OFFSET(x_pref, 160);
                PREF_OFFSET(x_pref, 192);
                PREF_OFFSET(x_pref, 224);
                x_pref += 32;

                LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
                            x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     d4, d5, d6, d7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     d8, d9, d10, d11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     d12, d13, d14, d15);
                SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
                SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
                FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
                FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
                ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
                            d12, d13, d14, d15, x, 2);
            }

            /* Remaining 0..15 elements. */
            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         d4, d5, d6, d7);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                    ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, 2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    ST_DP4_INC(d0, d1, d2, d3, x, 2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, 2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
                    SHF_W2_DP(d0, d1, d0, d1, SHF_78);
                    FMADD2(x0, x1, da_r_vec, d0, d1);
                    ST_DP2_INC(d0, d1, x, 2);
                }

                if (n & 1)
                {
                    /* Scalar complex multiply of the last element. */
                    LD_GP2_INC(px, 1, f0, f1);
                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;
                    ST_GP2_INC(tp0, tp1, x, 1);
                }
            }
        }
    }
    else
    {
        /* ---- strided vector: same four cases, pointer step 2 * inc_x,
                no prefetching and no software pipelining ---- */
        inc_x2 = 2 * inc_x;

        if ((0.0 == da_r) && (0.0 == da_i))
        {
            v2f64 zero_v = __msa_cast_to_vector_double(0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
            zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);

            for (i = (n >> 4); i--;)
            {
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, inc_x2);
                ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                           zero_v, zero_v, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
                               zero_v, zero_v, x, inc_x2);
                }

                if (n & 4)
                {
                    ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
                }

                if (n & 2)
                {
                    ST_DP2_INC(zero_v, zero_v, x, inc_x2);
                }

                if (n & 1)
                {
                    ST_DP(zero_v, x);
                }
            }
        }
        else if (0.0 == da_r)
        {
            /* Purely imaginary scalar (see the unit-stride case). */
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     x12, x13, x14, x15);
                SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
                SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         x4, x5, x6, x7);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         x0, x1, x2, x3);
                    SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
                    ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
                    SHF_W2_DP(x0, x1, x0, x1, SHF_78);
                    ST_DP2_INC(x0, x1, x, inc_x2);
                }

                if (n & 1)
                {
                    /* Real and imaginary parts of one element are adjacent,
                       hence step 1 here even though the vector is strided. */
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_i, f1, -da_i, f0, f1);
                    ST_GP2_INC(f1, f0, x, 1);
                }
            }
        }
        else if (0.0 == da_i)
        {
            /* Purely real scalar. */
            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                     x0, x1, x2, x3);
                MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                     x4, x5, x6, x7);
                MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
                     x8, x9, x10, x11);
                MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
                     x12, x13, x14, x15);
                ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
                            x12, x13, x14, x15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
                         x4, x5, x6, x7);
                    ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
                         x0, x1, x2, x3);
                    ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
                    ST_DP2_INC(x0, x1, x, inc_x2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    MUL2(f0, da_r, f1, da_r, f0, f1);
                    ST_GP2_INC(f0, f1, x, 1);
                }
            }
        }
        else
        {
            /* General scalar (see the unit-stride case). */
            da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
            da_i_vec_neg = -da_i_vec;
            da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);

            da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);

            for (i = (n >> 4); i--;)
            {
                LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
                            x10, x11, x12, x13, x14, x15);
                MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                     d0, d1, d2, d3);
                MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                     d4, d5, d6, d7);
                MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
                     d8, d9, d10, d11);
                MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
                     d12, d13, d14, d15);
                SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
                SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
                FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
                FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
                ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
                            d12, d13, d14, d15, x, inc_x2);
            }

            if (n & 15)
            {
                if (n & 8)
                {
                    LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
                         d4, d5, d6, d7);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
                    ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
                }

                if (n & 4)
                {
                    LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
                    MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
                         d0, d1, d2, d3);
                    SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
                    FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
                    ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
                }

                if (n & 2)
                {
                    LD_DP2_INC(px, inc_x2, x0, x1);
                    MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
                    SHF_W2_DP(d0, d1, d0, d1, SHF_78);
                    FMADD2(x0, x1, da_r_vec, d0, d1);
                    ST_DP2_INC(d0, d1, x, inc_x2);
                }

                if (n & 1)
                {
                    LD_GP2_INC(px, 1, f0, f1);
                    tp0 = da_r * f0;
                    tp0 -= da_i * f1;
                    tp1 = da_r * f1;
                    tp1 += da_i * f0;
                    ST_GP2_INC(tp0, tp1, x, 1);
                }
            }
        }
    }

    return (0);
}

238
kernel/mips/zswap_msa.c Normal file
View File

@ -0,0 +1,238 @@
/*******************************************************************************
Copyright (c) 2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#include "common.h"
#include "macros_msa.h"
/* Complex vector swap: exchanges n complex elements between srcx and srcy.
 *
 * n            number of complex elements; a negative n returns at once.
 * srcx, inc_x  first vector of interleaved (real, imag) pairs and its stride
 *              in complex elements.
 * srcy, inc_y  second vector and its stride.
 * dummy0, dummy1, dummy3, dummy4, dummy, dummy2 are not used.
 *
 * Returns 0 always.
 *
 * rd_x / rd_y are the read cursors; srcx / srcy themselves serve as the write
 * cursors.  One v2f64 carries one complex element.
 */
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
          FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
          BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
    BLASLONG blk, step_x, step_y, ahead_x, ahead_y;
    FLOAT *rd_x, *rd_y;
    v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
    v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;

    if (n < 0) return (0);

    /* FLOATs from each base pointer up to its next cache-line boundary
       (0 when already aligned); used only to bias the prefetch addresses. */
    ahead_x = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
    if (ahead_x > 0)
    {
        ahead_x = (L1_DATA_LINESIZE - ahead_x) / sizeof(FLOAT);
    }

    ahead_y = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
    if (ahead_y > 0)
    {
        ahead_y = (L1_DATA_LINESIZE - ahead_y) / sizeof(FLOAT);
    }

    rd_x = srcx;
    rd_y = srcy;

    if ((1 != inc_x) || (1 != inc_y))
    {
        /* ---- strided path: plain 8-element blocks, then the remainder ---- */
        step_x = 2 * inc_x;
        step_y = 2 * inc_y;

        blk = (n >> 3);
        while (blk--)
        {
            LD_DP8_INC(rd_x, step_x, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
            LD_DP8_INC(rd_y, step_y, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
            ST_DP8_INC(vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, srcy, step_y);
            ST_DP8_INC(vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, srcx, step_x);
        }

        switch (n & 7)
        {
            case 7:
                LD_DP7_INC(rd_x, step_x, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
                LD_DP7_INC(rd_y, step_y, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
                ST_DP7_INC(vx0, vx1, vx2, vx3, vx4, vx5, vx6, srcy, step_y);
                ST_DP7_INC(vy0, vy1, vy2, vy3, vy4, vy5, vy6, srcx, step_x);
                break;

            case 6:
                LD_DP6_INC(rd_x, step_x, vx0, vx1, vx2, vx3, vx4, vx5);
                LD_DP6_INC(rd_y, step_y, vy0, vy1, vy2, vy3, vy4, vy5);
                ST_DP6_INC(vx0, vx1, vx2, vx3, vx4, vx5, srcy, step_y);
                ST_DP6_INC(vy0, vy1, vy2, vy3, vy4, vy5, srcx, step_x);
                break;

            case 5:
                LD_DP5_INC(rd_x, step_x, vx0, vx1, vx2, vx3, vx4);
                LD_DP5_INC(rd_y, step_y, vy0, vy1, vy2, vy3, vy4);
                ST_DP5_INC(vx0, vx1, vx2, vx3, vx4, srcy, step_y);
                ST_DP5_INC(vy0, vy1, vy2, vy3, vy4, srcx, step_x);
                break;

            case 4:
                LD_DP4_INC(rd_x, step_x, vx0, vx1, vx2, vx3);
                LD_DP4_INC(rd_y, step_y, vy0, vy1, vy2, vy3);
                ST_DP4_INC(vx0, vx1, vx2, vx3, srcy, step_y);
                ST_DP4_INC(vy0, vy1, vy2, vy3, srcx, step_x);
                break;

            case 3:
                LD_DP3_INC(rd_x, step_x, vx0, vx1, vx2);
                LD_DP3_INC(rd_y, step_y, vy0, vy1, vy2);
                ST_DP3_INC(vx0, vx1, vx2, srcy, step_y);
                ST_DP3_INC(vy0, vy1, vy2, srcx, step_x);
                break;

            case 2:
                LD_DP2_INC(rd_x, step_x, vx0, vx1);
                LD_DP2_INC(rd_y, step_y, vy0, vy1);
                ST_DP2_INC(vx0, vx1, srcy, step_y);
                ST_DP2_INC(vy0, vy1, srcx, step_x);
                break;

            case 1:
                vx0 = LD_DP(rd_x);
                vy0 = LD_DP(rd_y);
                ST_DP(vy0, srcx);
                ST_DP(vx0, srcy);
                break;

            default:
                break;
        }

        return (0);
    }

    /* ---- contiguous path: both vectors advance 2 FLOATs per element ---- */
    if (n >> 3)
    {
        /* Prologue: the first block of x is loaded before the loop so that
           each iteration can overlap stores of one block with loads of the
           next. */
        LD_DP8_INC(rd_x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);

        blk = (n >> 3) - 1;
        while (blk--)
        {
            PREFETCH(rd_x + ahead_x + 16);
            PREFETCH(rd_x + ahead_x + 20);
            PREFETCH(rd_x + ahead_x + 24);
            PREFETCH(rd_x + ahead_x + 28);
            PREFETCH(rd_y + ahead_y + 16);
            PREFETCH(rd_y + ahead_y + 20);
            PREFETCH(rd_y + ahead_y + 24);
            PREFETCH(rd_y + ahead_y + 28);

            /* Read the current y block while writing the held x block
               into y. */
            vy0 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx0, srcy); srcy += 2;
            vy1 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx1, srcy); srcy += 2;
            vy2 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx2, srcy); srcy += 2;
            vy3 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx3, srcy); srcy += 2;
            vy4 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx4, srcy); srcy += 2;
            vy5 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx5, srcy); srcy += 2;
            vy6 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx6, srcy); srcy += 2;
            vy7 = LD_DP(rd_y); rd_y += 2;
            ST_DP(vx7, srcy); srcy += 2;

            /* Read the next x block while writing the y block into x. */
            vx0 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy0, srcx); srcx += 2;
            vx1 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy1, srcx); srcx += 2;
            vx2 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy2, srcx); srcx += 2;
            vx3 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy3, srcx); srcx += 2;
            vx4 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy4, srcx); srcx += 2;
            vx5 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy5, srcx); srcx += 2;
            vx6 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy6, srcx); srcx += 2;
            vx7 = LD_DP(rd_x); rd_x += 2;
            ST_DP(vy7, srcx); srcx += 2;
        }

        /* Epilogue: finish the block whose x half is still in registers. */
        LD_DP8_INC(rd_y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
        ST_DP8_INC(vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7, srcy, 2);
        ST_DP8_INC(vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7, srcx, 2);
    }

    /* Remaining 0..7 elements, handled in one shot per count. */
    switch (n & 7)
    {
        case 7:
            LD_DP7_INC(rd_x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
            LD_DP7_INC(rd_y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
            ST_DP7_INC(vx0, vx1, vx2, vx3, vx4, vx5, vx6, srcy, 2);
            ST_DP7_INC(vy0, vy1, vy2, vy3, vy4, vy5, vy6, srcx, 2);
            break;

        case 6:
            LD_DP6_INC(rd_x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
            LD_DP6_INC(rd_y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
            ST_DP6_INC(vx0, vx1, vx2, vx3, vx4, vx5, srcy, 2);
            ST_DP6_INC(vy0, vy1, vy2, vy3, vy4, vy5, srcx, 2);
            break;

        case 5:
            LD_DP5_INC(rd_x, 2, vx0, vx1, vx2, vx3, vx4);
            LD_DP5_INC(rd_y, 2, vy0, vy1, vy2, vy3, vy4);
            ST_DP5_INC(vx0, vx1, vx2, vx3, vx4, srcy, 2);
            ST_DP5_INC(vy0, vy1, vy2, vy3, vy4, srcx, 2);
            break;

        case 4:
            LD_DP4_INC(rd_x, 2, vx0, vx1, vx2, vx3);
            LD_DP4_INC(rd_y, 2, vy0, vy1, vy2, vy3);
            ST_DP4_INC(vx0, vx1, vx2, vx3, srcy, 2);
            ST_DP4_INC(vy0, vy1, vy2, vy3, srcx, 2);
            break;

        case 3:
            LD_DP3_INC(rd_x, 2, vx0, vx1, vx2);
            LD_DP3_INC(rd_y, 2, vy0, vy1, vy2);
            ST_DP3_INC(vx0, vx1, vx2, srcy, 2);
            ST_DP3_INC(vy0, vy1, vy2, srcx, 2);
            break;

        case 2:
            LD_DP2_INC(rd_x, 2, vx0, vx1);
            LD_DP2_INC(rd_y, 2, vy0, vy1);
            ST_DP2_INC(vx0, vx1, srcy, 2);
            ST_DP2_INC(vy0, vy1, srcx, 2);
            break;

        case 1:
            vx0 = LD_DP(rd_x);
            vy0 = LD_DP(rd_y);
            ST_DP(vy0, srcx);
            ST_DP(vx0, srcy);
            break;

        default:
            break;
    }

    return (0);
}