Add msa optimization for AXPY, COPY, SCALE, SWAP
Signed-off-by: kaustubh <kaustubh.raste@imgtec.com>
This commit is contained in:
parent
2ffbbb54f6
commit
88afb3bc94
|
@ -42,15 +42,29 @@ CASUMKERNEL = ../mips/asum.c
|
|||
ZASUMKERNEL = ../mips/asum.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SAXPYKERNEL = ../mips/saxpy_msa.c
|
||||
DAXPYKERNEL = ../mips/daxpy_msa.c
|
||||
CAXPYKERNEL = ../mips/caxpy_msa.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy_msa.c
|
||||
else
|
||||
SAXPYKERNEL = ../mips/axpy.c
|
||||
DAXPYKERNEL = ../mips/axpy.c
|
||||
CAXPYKERNEL = ../mips/zaxpy.c
|
||||
ZAXPYKERNEL = ../mips/zaxpy.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SCOPYKERNEL = ../mips/scopy_msa.c
|
||||
DCOPYKERNEL = ../mips/dcopy_msa.c
|
||||
CCOPYKERNEL = ../mips/ccopy_msa.c
|
||||
ZCOPYKERNEL = ../mips/zcopy_msa.c
|
||||
else
|
||||
SCOPYKERNEL = ../mips/copy.c
|
||||
DCOPYKERNEL = ../mips/copy.c
|
||||
CCOPYKERNEL = ../mips/zcopy.c
|
||||
ZCOPYKERNEL = ../mips/zcopy.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SDOTKERNEL = ../mips/sdot_msa.c
|
||||
|
@ -74,15 +88,29 @@ DROTKERNEL = ../mips/rot.c
|
|||
CROTKERNEL = ../mips/zrot.c
|
||||
ZROTKERNEL = ../mips/zrot.c
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SSCALKERNEL = ../mips/sscal_msa.c
|
||||
DSCALKERNEL = ../mips/dscal_msa.c
|
||||
CSCALKERNEL = ../mips/cscal_msa.c
|
||||
ZSCALKERNEL = ../mips/zscal_msa.c
|
||||
else
|
||||
SSCALKERNEL = ../mips/scal.c
|
||||
DSCALKERNEL = ../mips/scal.c
|
||||
CSCALKERNEL = ../mips/zscal.c
|
||||
ZSCALKERNEL = ../mips/zscal.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SSWAPKERNEL = ../mips/sswap.c
|
||||
DSWAPKERNEL = ../mips/dswap.c
|
||||
CSWAPKERNEL = ../mips/cswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
else
|
||||
SSWAPKERNEL = ../mips/swap.c
|
||||
DSWAPKERNEL = ../mips/swap.c
|
||||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
endif
|
||||
|
||||
ifdef HAVE_MSA
|
||||
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
|
||||
|
|
|
@ -0,0 +1,471 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#if !defined(CONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i, inc_x2, inc_y2;
|
||||
FLOAT *py;
|
||||
v4f32 x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
v4f32 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
|
||||
v4f32 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
|
||||
v4f32 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
|
||||
FLOAT xd0, xd1, xd2, xd3, xd4, xd5, xd6, xd7;
|
||||
FLOAT yd0, yd1, yd2, yd3, yd4, yd5, yd6, yd7;
|
||||
|
||||
if (n < 0) return(0);
|
||||
if ((da_r == 0.0) && (da_i == 0.0)) return(0);
|
||||
|
||||
py = y;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
FLOAT *x_pref, *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 64;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 64;
|
||||
|
||||
dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
|
||||
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
x_pref += 32;
|
||||
y_pref += 32;
|
||||
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_W2_SP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ILVRL_W2_SP(y1i, y1r, y2, y3);
|
||||
ILVRL_W2_SP(y2i, y2r, y4, y5);
|
||||
ILVRL_W2_SP(y3i, y3r, y6, y7);
|
||||
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(x, 4, x0, x1, x2, x3);
|
||||
LD_SP4_INC(py, 4, y0, y1, y2, y3);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ILVRL_W2_SP(y1i, y1r, y2, y3);
|
||||
ST_SP4_INC(y0, y1, y2, y3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(x, 4, x0, x1);
|
||||
LD_SP2_INC(py, 4, y0, y1);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ST_SP2_INC(y0, y1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP4_INC(x, 1, xd0, xd1, xd2, xd3);
|
||||
LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
|
||||
|
||||
FMADD2(xd0, xd2, da_r, yd0, yd2);
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd3 OP0 da_r * xd3;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd2 OP1 da_i * xd3;
|
||||
yd1 OP2 da_i * xd0;
|
||||
yd3 OP2 da_i * xd2;
|
||||
|
||||
ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (1 == inc_y)
|
||||
{
|
||||
FLOAT *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
v4f32 x8, x9, x10, x11, x12, x13, x14;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 64;
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
dar_vec = COPY_FLOAT_TO_VECTOR(da_r);
|
||||
dai_vec = COPY_FLOAT_TO_VECTOR(da_i);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
y_pref += 32;
|
||||
|
||||
LD_SP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x14);
|
||||
LD_SP7_INC(x, inc_x2, x8, x9, x10, x11, x12, x13, x7);
|
||||
PCKEV_D2_SP(x1, x0, x3, x2, x0, x1);
|
||||
PCKEV_D2_SP(x5, x4, x14, x6, x2, x3);
|
||||
PCKEV_D2_SP(x9, x8, x11, x10, x4, x5);
|
||||
x6 = (v4f32) __msa_pckev_d((v2i64) x13, (v2i64) x12);
|
||||
x7 = (v4f32) __msa_insert_w((v4i32) x7, 2, *((int *) x));
|
||||
x7 = (v4f32) __msa_insert_w((v4i32) x7, 3, *((int *) (x + 1)));
|
||||
x += inc_x2;
|
||||
|
||||
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_W2_SP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_W2_SP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_W2_SP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_W2_SP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ILVRL_W2_SP(y1i, y1r, y2, y3);
|
||||
ILVRL_W2_SP(y2i, y2r, y4, y5);
|
||||
ILVRL_W2_SP(y3i, y3r, y6, y7);
|
||||
|
||||
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP7_INC(x, inc_x2, x0, x1, x2, x6, x4, x5, x3);
|
||||
PCKEV_D2_SP(x1, x0, x6, x2, x0, x1);
|
||||
|
||||
x2 = (v4f32) __msa_pckev_d((v2i64) x5, (v2i64) x4);
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) (x + 1)));
|
||||
x += inc_x2;
|
||||
|
||||
LD_SP4_INC(py, 4, y0, y1, y2, y3);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ILVRL_W2_SP(y1i, y1r, y2, y3);
|
||||
ST_SP4_INC(y0, y1, y2, y3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_SP3_INC(x, inc_x2, x0, x2, x1);
|
||||
|
||||
x0 = (v4f32) __msa_pckev_d((v2i64) x2, (v2i64) x0);
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) (x + 1)));
|
||||
x += inc_x2;
|
||||
|
||||
LD_SP2_INC(py, 4, y0, y1);
|
||||
PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_W2_SP(y0i, y0r, y0, y1);
|
||||
ST_SP2_INC(y0, y1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
xd0 = x[0];
|
||||
xd1 = x[1];
|
||||
x += inc_x2;
|
||||
xd2 = x[0];
|
||||
xd3 = x[1];
|
||||
x += inc_x2;
|
||||
|
||||
LD_GP4_INC(py, 1, yd0, yd1, yd2, yd3);
|
||||
|
||||
FMADD2(xd0, xd2, da_r, yd0, yd2);
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd3 OP0 da_r * xd3;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd2 OP1 da_i * xd3;
|
||||
yd1 OP2 da_i * xd0;
|
||||
yd3 OP2 da_i * xd2;
|
||||
|
||||
ST_GP4_INC(yd0, yd1, yd2, yd3, y, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
xd0 = x[0];
|
||||
xd1 = x[1];
|
||||
x += inc_x2;
|
||||
xd2 = x[0];
|
||||
xd3 = x[1];
|
||||
x += inc_x2;
|
||||
xd4 = x[0];
|
||||
xd5 = x[1];
|
||||
x += inc_x2;
|
||||
xd6 = x[0];
|
||||
xd7 = x[1];
|
||||
x += inc_x2;
|
||||
|
||||
yd0 = py[0];
|
||||
yd1 = py[1];
|
||||
py += inc_y2;
|
||||
yd2 = py[0];
|
||||
yd3 = py[1];
|
||||
py += inc_y2;
|
||||
yd4 = py[0];
|
||||
yd5 = py[1];
|
||||
py += inc_y2;
|
||||
yd6 = py[0];
|
||||
yd7 = py[1];
|
||||
py += inc_y2;
|
||||
|
||||
FMADD4(xd0, xd2, xd4, xd6, da_r, yd0, yd2, yd4, yd6);
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd3 OP0 da_r * xd3;
|
||||
yd5 OP0 da_r * xd5;
|
||||
yd7 OP0 da_r * xd7;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd2 OP1 da_i * xd3;
|
||||
yd4 OP1 da_i * xd5;
|
||||
yd6 OP1 da_i * xd7;
|
||||
yd1 OP2 da_i * xd0;
|
||||
yd3 OP2 da_i * xd2;
|
||||
yd5 OP2 da_i * xd4;
|
||||
yd7 OP2 da_i * xd6;
|
||||
|
||||
y[0] = yd0;
|
||||
y[1] = yd1;
|
||||
y += inc_y2;
|
||||
y[0] = yd2;
|
||||
y[1] = yd3;
|
||||
y += inc_y2;
|
||||
y[0] = yd4;
|
||||
y[1] = yd5;
|
||||
y += inc_y2;
|
||||
y[0] = yd6;
|
||||
y[1] = yd7;
|
||||
y += inc_y2;
|
||||
}
|
||||
|
||||
if (n & 3)
|
||||
{
|
||||
if (n & 2)
|
||||
{
|
||||
xd0 = x[0];
|
||||
xd1 = x[1];
|
||||
x += inc_x2;
|
||||
xd2 = x[0];
|
||||
xd3 = x[1];
|
||||
x += inc_x2;
|
||||
|
||||
yd0 = py[0];
|
||||
yd1 = py[1];
|
||||
py += inc_y2;
|
||||
yd2 = py[0];
|
||||
yd3 = py[1];
|
||||
py += inc_y2;
|
||||
|
||||
FMADD2(xd0, xd2, da_r, yd0, yd2);
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd3 OP0 da_r * xd3;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd2 OP1 da_i * xd3;
|
||||
yd1 OP2 da_i * xd0;
|
||||
yd3 OP2 da_i * xd2;
|
||||
|
||||
y[0] = yd0;
|
||||
y[1] = yd1;
|
||||
y += inc_y2;
|
||||
y[0] = yd2;
|
||||
y[1] = yd3;
|
||||
y += inc_y2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
xd0 = x[0];
|
||||
xd1 = x[1];
|
||||
|
||||
yd0 = y[0];
|
||||
yd1 = y[1];
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
y[0] = yd0;
|
||||
y[1] = yd1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,201 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i, inc_x2, inc_y2;
|
||||
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n > 31)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 128 + 32;
|
||||
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 5) - 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 64;
|
||||
|
||||
x8 = LD_SP(x); x += 4;
|
||||
ST_SP(x0, y); y += 4;
|
||||
x9 = LD_SP(x); x += 4;
|
||||
ST_SP(x1, y); y += 4;
|
||||
x10 = LD_SP(x); x += 4;
|
||||
ST_SP(x2, y); y += 4;
|
||||
x11 = LD_SP(x); x += 4;
|
||||
ST_SP(x3, y); y += 4;
|
||||
x12 = LD_SP(x); x += 4;
|
||||
ST_SP(x4, y); y += 4;
|
||||
x13 = LD_SP(x); x += 4;
|
||||
ST_SP(x5, y); y += 4;
|
||||
x14 = LD_SP(x); x += 4;
|
||||
ST_SP(x6, y); y += 4;
|
||||
x15 = LD_SP(x); x += 4;
|
||||
ST_SP(x7, y); y += 4;
|
||||
x0 = LD_SP(x); x += 4;
|
||||
ST_SP(x8, y); y += 4;
|
||||
x1 = LD_SP(x); x += 4;
|
||||
ST_SP(x9, y); y += 4;
|
||||
x2 = LD_SP(x); x += 4;
|
||||
ST_SP(x10, y); y += 4;
|
||||
x3 = LD_SP(x); x += 4;
|
||||
ST_SP(x11, y); y += 4;
|
||||
x4 = LD_SP(x); x += 4;
|
||||
ST_SP(x12, y); y += 4;
|
||||
x5 = LD_SP(x); x += 4;
|
||||
ST_SP(x13, y); y += 4;
|
||||
x6 = LD_SP(x); x += 4;
|
||||
ST_SP(x14, y); y += 4;
|
||||
x7 = LD_SP(x); x += 4;
|
||||
ST_SP(x15, y); y += 4;
|
||||
}
|
||||
|
||||
x8 = LD_SP(x); x += 4;
|
||||
x9 = LD_SP(x); x += 4;
|
||||
ST_SP(x0, y); y += 4;
|
||||
x10 = LD_SP(x); x += 4;
|
||||
ST_SP(x1, y); y += 4;
|
||||
x11 = LD_SP(x); x += 4;
|
||||
ST_SP(x2, y); y += 4;
|
||||
x12 = LD_SP(x); x += 4;
|
||||
ST_SP(x3, y); y += 4;
|
||||
x13 = LD_SP(x); x += 4;
|
||||
ST_SP(x4, y); y += 4;
|
||||
x14 = LD_SP(x); x += 4;
|
||||
ST_SP(x5, y); y += 4;
|
||||
x15 = LD_SP(x); x += 4;
|
||||
ST_SP(x6, y); y += 4;
|
||||
ST_SP(x7, y); y += 4;
|
||||
|
||||
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if (n & 16)
|
||||
{
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(x, 4, x0, x1, x2, x3);
|
||||
ST_SP4_INC(x0, x1, x2, x3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(x, 4, x0, x1);
|
||||
ST_SP2_INC(x0, x1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP4_INC(x, 1, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, y, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
f0 = *x;
|
||||
f1 = *(x+1); x += inc_x2;
|
||||
f2 = *x;
|
||||
f3 = *(x+1); x += inc_x2;
|
||||
f4 = *x;
|
||||
f5 = *(x+1); x += inc_x2;
|
||||
f6 = *x;
|
||||
f7 = *(x+1); x += inc_x2;
|
||||
|
||||
*y = f0;
|
||||
*(y+1) = f1; y += inc_y2;
|
||||
*y = f2;
|
||||
*(y+1) = f3; y += inc_y2;
|
||||
*y = f4;
|
||||
*(y+1) = f5; y += inc_y2;
|
||||
*y = f6;
|
||||
*(y+1) = f7; y += inc_y2;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
f0 = *x;
|
||||
f1 = *(x+1); x += inc_x2;
|
||||
f2 = *x;
|
||||
f3 = *(x+1); x += inc_x2;
|
||||
|
||||
*y = f0;
|
||||
*(y+1) = f1; y += inc_y2;
|
||||
*y = f2;
|
||||
*(y+1) = f3; y += inc_y2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
File diff suppressed because it is too large
Load Diff
|
@ -0,0 +1,281 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
|
||||
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i = 0, pref_offsetx, pref_offsety;
|
||||
FLOAT *px, *py;
|
||||
BLASLONG inc_x2, inc_y2;
|
||||
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
|
||||
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsetx > 0)
|
||||
{
|
||||
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
|
||||
pref_offsetx = pref_offsetx / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsety > 0)
|
||||
{
|
||||
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
|
||||
pref_offsety = pref_offsety / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
px = srcx;
|
||||
py = srcy;
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n >> 4)
|
||||
{
|
||||
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
|
||||
|
||||
for (i = (n >> 4) - 1; i--;)
|
||||
{
|
||||
PREFETCH(px + pref_offsetx + 32);
|
||||
PREFETCH(px + pref_offsetx + 40);
|
||||
PREFETCH(px + pref_offsetx + 48);
|
||||
PREFETCH(px + pref_offsetx + 56);
|
||||
|
||||
PREFETCH(py + pref_offsety + 32);
|
||||
PREFETCH(py + pref_offsety + 40);
|
||||
PREFETCH(py + pref_offsety + 48);
|
||||
PREFETCH(py + pref_offsety + 56);
|
||||
|
||||
yv0 = LD_SP(py); py += 4;
|
||||
ST_SP(xv0, srcy); srcy += 4;
|
||||
yv1 = LD_SP(py); py += 4;
|
||||
ST_SP(xv1, srcy); srcy += 4;
|
||||
yv2 = LD_SP(py); py += 4;
|
||||
ST_SP(xv2, srcy); srcy += 4;
|
||||
yv3 = LD_SP(py); py += 4;
|
||||
ST_SP(xv3, srcy); srcy += 4;
|
||||
yv4 = LD_SP(py); py += 4;
|
||||
ST_SP(xv4, srcy); srcy += 4;
|
||||
yv5 = LD_SP(py); py += 4;
|
||||
ST_SP(xv5, srcy); srcy += 4;
|
||||
yv6 = LD_SP(py); py += 4;
|
||||
ST_SP(xv6, srcy); srcy += 4;
|
||||
yv7 = LD_SP(py); py += 4;
|
||||
ST_SP(xv7, srcy); srcy += 4;
|
||||
|
||||
xv0 = LD_SP(px); px += 4;
|
||||
ST_SP(yv0, srcx); srcx += 4;
|
||||
xv1 = LD_SP(px); px += 4;
|
||||
ST_SP(yv1, srcx); srcx += 4;
|
||||
xv2 = LD_SP(px); px += 4;
|
||||
ST_SP(yv2, srcx); srcx += 4;
|
||||
xv3 = LD_SP(px); px += 4;
|
||||
ST_SP(yv3, srcx); srcx += 4;
|
||||
xv4 = LD_SP(px); px += 4;
|
||||
ST_SP(yv4, srcx); srcx += 4;
|
||||
xv5 = LD_SP(px); px += 4;
|
||||
ST_SP(yv5, srcx); srcx += 4;
|
||||
xv6 = LD_SP(px); px += 4;
|
||||
ST_SP(yv6, srcx); srcx += 4;
|
||||
xv7 = LD_SP(px); px += 4;
|
||||
ST_SP(yv7, srcx); srcx += 4;
|
||||
}
|
||||
|
||||
LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
|
||||
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
|
||||
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4) && (n & 2))
|
||||
{
|
||||
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
|
||||
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
|
||||
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
|
||||
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
|
||||
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
|
||||
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
|
||||
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
|
||||
}
|
||||
else if ((n & 8) && (n & 2))
|
||||
{
|
||||
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
|
||||
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
|
||||
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
|
||||
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_SP3_INC(px, 4, xv0, xv1, xv2);
|
||||
LD_SP3_INC(py, 4, yv0, yv1, yv2);
|
||||
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
|
||||
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
|
||||
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
|
||||
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
|
||||
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(px, 4, xv0, xv1);
|
||||
LD_SP2_INC(py, 4, yv0, yv1);
|
||||
ST_SP2_INC(xv0, xv1, srcy, 4);
|
||||
ST_SP2_INC(yv0, yv1, srcx, 4);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
xv0 = LD_SP(px);
|
||||
yv0 = LD_SP(py);
|
||||
|
||||
px += 4;
|
||||
py += 4;
|
||||
|
||||
ST_SP(xv0, srcy);
|
||||
ST_SP(yv0, srcx);
|
||||
|
||||
srcx += 4;
|
||||
srcy += 4;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, x0, x1);
|
||||
LD_GP2_INC(py, 1, y0, y1);
|
||||
ST_GP2_INC(x0, x1, srcy, 1);
|
||||
ST_GP2_INC(y0, y1, srcx, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
x0 = srcx[0 * inc_x2];
|
||||
x1 = srcx[0 * inc_x2 + 1];
|
||||
x2 = srcx[1 * inc_x2];
|
||||
x3 = srcx[1 * inc_x2 + 1];
|
||||
x4 = srcx[2 * inc_x2];
|
||||
x5 = srcx[2 * inc_x2 + 1];
|
||||
x6 = srcx[3 * inc_x2];
|
||||
x7 = srcx[3 * inc_x2 + 1];
|
||||
|
||||
y0 = srcy[0 * inc_y2];
|
||||
y1 = srcy[0 * inc_y2 + 1];
|
||||
y2 = srcy[1 * inc_y2];
|
||||
y3 = srcy[1 * inc_y2 + 1];
|
||||
y4 = srcy[2 * inc_y2];
|
||||
y5 = srcy[2 * inc_y2 + 1];
|
||||
y6 = srcy[3 * inc_y2];
|
||||
y7 = srcy[3 * inc_y2 + 1];
|
||||
|
||||
srcx[0 * inc_x2] = y0;
|
||||
srcx[0 * inc_x2 + 1] = y1;
|
||||
srcx[1 * inc_x2] = y2;
|
||||
srcx[1 * inc_x2 + 1] = y3;
|
||||
srcx[2 * inc_x2] = y4;
|
||||
srcx[2 * inc_x2 + 1] = y5;
|
||||
srcx[3 * inc_x2] = y6;
|
||||
srcx[3 * inc_x2 + 1] = y7;
|
||||
|
||||
srcy[0 * inc_y2] = x0;
|
||||
srcy[0 * inc_y2 + 1] = x1;
|
||||
srcy[1 * inc_y2] = x2;
|
||||
srcy[1 * inc_y2 + 1] = x3;
|
||||
srcy[2 * inc_y2] = x4;
|
||||
srcy[2 * inc_y2 + 1] = x5;
|
||||
srcy[3 * inc_y2] = x6;
|
||||
srcy[3 * inc_y2 + 1] = x7;
|
||||
|
||||
srcx += 4 * inc_x2;
|
||||
srcy += 4 * inc_y2;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
x0 = srcx[0 * inc_x2];
|
||||
x1 = srcx[0 * inc_x2 + 1];
|
||||
x2 = srcx[1 * inc_x2];
|
||||
x3 = srcx[1 * inc_x2 + 1];
|
||||
|
||||
y0 = srcy[0 * inc_y2];
|
||||
y1 = srcy[0 * inc_y2 + 1];
|
||||
y2 = srcy[1 * inc_y2];
|
||||
y3 = srcy[1 * inc_y2 + 1];
|
||||
|
||||
srcx[0 * inc_x2] = y0;
|
||||
srcx[0 * inc_x2 + 1] = y1;
|
||||
srcx[1 * inc_x2] = y2;
|
||||
srcx[1 * inc_x2 + 1] = y3;
|
||||
|
||||
srcy[0 * inc_y2] = x0;
|
||||
srcy[0 * inc_y2 + 1] = x1;
|
||||
srcy[1 * inc_y2] = x2;
|
||||
srcy[1 * inc_y2 + 1] = x3;
|
||||
|
||||
srcx += 2 * inc_x2;
|
||||
srcy += 2 * inc_y2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = srcx[0 * inc_x2];
|
||||
x1 = srcx[0 * inc_x2 + 1];
|
||||
|
||||
y0 = srcy[0 * inc_y2];
|
||||
y1 = srcy[0 * inc_y2 + 1];
|
||||
|
||||
srcx[0 * inc_x2] = y0;
|
||||
srcx[0 * inc_x2 + 1] = y1;
|
||||
|
||||
srcy[0 * inc_y2] = x0;
|
||||
srcy[0 * inc_y2 + 1] = x1;
|
||||
|
||||
srcx += inc_x2;
|
||||
srcy += inc_y2;
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,246 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#if !defined(CONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *py;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v2f64 da_vec, zero_v = {0};
|
||||
|
||||
if ((n < 0) || (da == 0.0)) return(0);
|
||||
|
||||
py = y;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
FLOAT *x_pref, *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 32;
|
||||
|
||||
da_vec = COPY_DOUBLE_TO_VECTOR(da);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
x_pref += 16;
|
||||
y_pref += 16;
|
||||
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(x, 2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, 2, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2_INC(x, 2, x0, x1);
|
||||
LD_DP2_INC(py, 2, y0, y1);
|
||||
FMADD2(x0, x1, da_vec, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
x0 = LD_DP(x); x += 2;
|
||||
y0 = LD_DP(py); py += 2;
|
||||
y0 += da_vec * x0;
|
||||
ST_DP(y0, y); y += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
y[0] += da * x[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (1 == inc_y)
|
||||
{
|
||||
FLOAT *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
v2f64 x8, x9, x10, x11, x12, x13, x14;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 32;
|
||||
|
||||
da_vec = COPY_DOUBLE_TO_VECTOR(da);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
y_pref += 16;
|
||||
|
||||
LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x14);
|
||||
LD_DP7_INC(x, inc_x, x8, x9, x10, x11, x12, x13, x7);
|
||||
|
||||
PCKEV_D2_SD(x1, x0, x3, x2, x0, x1);
|
||||
PCKEV_D2_SD(x5, x4, x14, x6, x2, x3);
|
||||
PCKEV_D2_SD(x9, x8, x11, x10, x4, x5);
|
||||
x6 = (v2f64) __msa_pckev_d((v2i64) x13, (v2i64) x12);
|
||||
x7 = (v2f64) __msa_insert_d((v2i64) x7, 1, *((BLASLONG *) x));
|
||||
x += inc_x;
|
||||
|
||||
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP7_INC(x, inc_x, x0, x1, x2, x6, x4, x5, x3);
|
||||
|
||||
PCKEV_D2_SD(x1, x0, x6, x2, x0, x1);
|
||||
x2 = (v2f64) __msa_pckev_d((v2i64) x5, (v2i64) x4);
|
||||
x3 = (v2f64) __msa_insert_d((v2i64) x3, 1, *((BLASLONG *) x));
|
||||
x += inc_x;
|
||||
|
||||
LD_DP4_INC(py, 2, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP3_INC(x, inc_x, x0, x2, x1);
|
||||
|
||||
x0 = (v2f64) __msa_pckev_d((v2i64) x2, (v2i64) x0);
|
||||
x1 = (v2f64) __msa_insert_d((v2i64) x1, 1, *((BLASLONG *) x));
|
||||
x += inc_x;
|
||||
|
||||
LD_DP2_INC(py, 2, y0, y1);
|
||||
FMADD2(x0, x1, da_vec, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
x0 = (v2f64) __msa_insert_d((v2i64) zero_v, 0, *((BLASLONG *) x));
|
||||
x += inc_x;
|
||||
x0 = (v2f64) __msa_insert_d((v2i64) x0, 1, *((BLASLONG *) x));
|
||||
x += inc_x;
|
||||
|
||||
y0 = LD_DP(py); py += 2;
|
||||
y0 += da_vec * x0;
|
||||
ST_DP(y0, y); y += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
y[0] += da * x[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
|
||||
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 3)
|
||||
{
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(py, inc_y, y0, y1);
|
||||
FMADD2(x0, x1, da, y0, y1);
|
||||
ST_GP2_INC(y0, y1, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y += da * *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,180 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n > 31)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 64 + 16;
|
||||
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 5) - 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
x8 = LD_DP(x); x += 2;
|
||||
ST_DP(x0, y); y += 2;
|
||||
x9 = LD_DP(x); x += 2;
|
||||
ST_DP(x1, y); y += 2;
|
||||
x10 = LD_DP(x); x += 2;
|
||||
ST_DP(x2, y); y += 2;
|
||||
x11 = LD_DP(x); x += 2;
|
||||
ST_DP(x3, y); y += 2;
|
||||
x12 = LD_DP(x); x += 2;
|
||||
ST_DP(x4, y); y += 2;
|
||||
x13 = LD_DP(x); x += 2;
|
||||
ST_DP(x5, y); y += 2;
|
||||
x14 = LD_DP(x); x += 2;
|
||||
ST_DP(x6, y); y += 2;
|
||||
x15 = LD_DP(x); x += 2;
|
||||
ST_DP(x7, y); y += 2;
|
||||
x0 = LD_DP(x); x += 2;
|
||||
ST_DP(x8, y); y += 2;
|
||||
x1 = LD_DP(x); x += 2;
|
||||
ST_DP(x9, y); y += 2;
|
||||
x2 = LD_DP(x); x += 2;
|
||||
ST_DP(x10, y); y += 2;
|
||||
x3 = LD_DP(x); x += 2;
|
||||
ST_DP(x11, y); y += 2;
|
||||
x4 = LD_DP(x); x += 2;
|
||||
ST_DP(x12, y); y += 2;
|
||||
x5 = LD_DP(x); x += 2;
|
||||
ST_DP(x13, y); y += 2;
|
||||
x6 = LD_DP(x); x += 2;
|
||||
ST_DP(x14, y); y += 2;
|
||||
x7 = LD_DP(x); x += 2;
|
||||
ST_DP(x15, y); y += 2;
|
||||
}
|
||||
|
||||
x8 = LD_DP(x); x += 2;
|
||||
x9 = LD_DP(x); x += 2;
|
||||
ST_DP(x0, y); y += 2;
|
||||
x10 = LD_DP(x); x += 2;
|
||||
ST_DP(x1, y); y += 2;
|
||||
x11 = LD_DP(x); x += 2;
|
||||
ST_DP(x2, y); y += 2;
|
||||
x12 = LD_DP(x); x += 2;
|
||||
ST_DP(x3, y); y += 2;
|
||||
x13 = LD_DP(x); x += 2;
|
||||
ST_DP(x4, y); y += 2;
|
||||
x14 = LD_DP(x); x += 2;
|
||||
ST_DP(x5, y); y += 2;
|
||||
x15 = LD_DP(x); x += 2;
|
||||
ST_DP(x6, y); y += 2;
|
||||
ST_DP(x7, y); y += 2;
|
||||
|
||||
ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if (n & 16)
|
||||
{
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(x, 2, x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(x, 1, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, y, 1);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,368 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *px;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
v2f64 da_vec;
|
||||
|
||||
px = x;
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
if (0.0 == da)
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
|
||||
for (i = (n >> 5); i--;)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if (n & 16)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
ST_DP2_INC(zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
*x = 0; x += 1;
|
||||
*x = 0; x += 1;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
da_vec = COPY_DOUBLE_TO_VECTOR(da);
|
||||
|
||||
if (n > 31)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32 + 16;
|
||||
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = 0; i < (n >> 5) - 1; i++)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
x8 = LD_DP(px); px += 2;
|
||||
x0 *= da_vec;
|
||||
x9 = LD_DP(px); px += 2;
|
||||
x1 *= da_vec;
|
||||
x10 = LD_DP(px); px += 2;
|
||||
x2 *= da_vec;
|
||||
x11 = LD_DP(px); px += 2;
|
||||
x3 *= da_vec;
|
||||
x12 = LD_DP(px); px += 2;
|
||||
x4 *= da_vec;
|
||||
x13 = LD_DP(px); px += 2;
|
||||
x5 *= da_vec;
|
||||
x14 = LD_DP(px); px += 2;
|
||||
x6 *= da_vec;
|
||||
x15 = LD_DP(px); px += 2;
|
||||
x7 *= da_vec;
|
||||
x8 *= da_vec;
|
||||
ST_DP(x0, x); x += 2;
|
||||
x9 *= da_vec;
|
||||
ST_DP(x1, x); x += 2;
|
||||
x10 *= da_vec;
|
||||
ST_DP(x2, x); x += 2;
|
||||
x11 *= da_vec;
|
||||
ST_DP(x3, x); x += 2;
|
||||
x12 *= da_vec;
|
||||
ST_DP(x4, x); x += 2;
|
||||
x13 *= da_vec;
|
||||
ST_DP(x5, x); x += 2;
|
||||
x14 *= da_vec;
|
||||
ST_DP(x6, x); x += 2;
|
||||
x15 *= da_vec;
|
||||
ST_DP(x7, x); x += 2;
|
||||
ST_DP(x8, x); x += 2;
|
||||
x0 = LD_DP(px); px += 2;
|
||||
ST_DP(x9, x); x += 2;
|
||||
x1 = LD_DP(px); px += 2;
|
||||
ST_DP(x10, x); x += 2;
|
||||
x2 = LD_DP(px); px += 2;
|
||||
ST_DP(x11, x); x += 2;
|
||||
x3 = LD_DP(px); px += 2;
|
||||
ST_DP(x12, x); x += 2;
|
||||
x4 = LD_DP(px); px += 2;
|
||||
ST_DP(x13, x); x += 2;
|
||||
x5 = LD_DP(px); px += 2;
|
||||
ST_DP(x14, x); x += 2;
|
||||
x6 = LD_DP(px); px += 2;
|
||||
ST_DP(x15, x); x += 2;
|
||||
x7 = LD_DP(px); px += 2;
|
||||
}
|
||||
|
||||
x8 = LD_DP(px); px += 2;
|
||||
x0 *= da_vec;
|
||||
x9 = LD_DP(px); px += 2;
|
||||
x1 *= da_vec;
|
||||
x10 = LD_DP(px); px += 2;
|
||||
x2 *= da_vec;
|
||||
x11 = LD_DP(px); px += 2;
|
||||
x3 *= da_vec;
|
||||
x12 = LD_DP(px); px += 2;
|
||||
x4 *= da_vec;
|
||||
x13 = LD_DP(px); px += 2;
|
||||
x5 *= da_vec;
|
||||
x14 = LD_DP(px); px += 2;
|
||||
x6 *= da_vec;
|
||||
x15 = LD_DP(px); px += 2;
|
||||
x7 *= da_vec;
|
||||
x8 *= da_vec;
|
||||
ST_DP(x0, x); x += 2;
|
||||
x9 *= da_vec;
|
||||
ST_DP(x1, x); x += 2;
|
||||
x10 *= da_vec;
|
||||
ST_DP(x2, x); x += 2;
|
||||
x11 *= da_vec;
|
||||
ST_DP(x3, x); x += 2;
|
||||
x12 *= da_vec;
|
||||
ST_DP(x4, x); x += 2;
|
||||
x13 *= da_vec;
|
||||
ST_DP(x5, x); x += 2;
|
||||
x15 *= da_vec;
|
||||
ST_DP(x6, x); x += 2;
|
||||
x14 *= da_vec;
|
||||
ST_DP(x7, x); x += 2;
|
||||
|
||||
ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 2);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if (n & 16)
|
||||
{
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
|
||||
MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(px, 2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP2_INC(px, 2, x0, x1);
|
||||
MUL2(x0, da_vec, x1, da_vec, x0, x1);
|
||||
ST_DP2_INC(x0, x1, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da, f1, da, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x *= da;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (da == 0.0)
|
||||
{
|
||||
for (i = n; i--;)
|
||||
{
|
||||
*x = 0.0;
|
||||
|
||||
x += inc_x;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (n > 15)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
for (i = 0; i < (n >> 4) - 1; i++)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
|
||||
f4 *= da;
|
||||
f5 *= da;
|
||||
*x = f0; x += inc_x;
|
||||
f6 *= da;
|
||||
*x = f1; x += inc_x;
|
||||
f7 *= da;
|
||||
*x = f2; x += inc_x;
|
||||
f8 *= da;
|
||||
*x = f3; x += inc_x;
|
||||
f9 *= da;
|
||||
*x = f4; x += inc_x;
|
||||
f10 *= da;
|
||||
*x = f5; x += inc_x;
|
||||
f11 *= da;
|
||||
*x = f6; x += inc_x;
|
||||
f12 *= da;
|
||||
*x = f7; x += inc_x;
|
||||
f13 *= da;
|
||||
*x = f8; x += inc_x;
|
||||
f14 *= da;
|
||||
*x = f9; x += inc_x;
|
||||
f15 *= da;
|
||||
*x = f10; x += inc_x;
|
||||
*x = f11; x += inc_x;
|
||||
f0 = *px; px += inc_x;
|
||||
*x = f12; x += inc_x;
|
||||
f1 = *px; px += inc_x;
|
||||
*x = f13; x += inc_x;
|
||||
f2 = *px; px += inc_x;
|
||||
*x = f14; x += inc_x;
|
||||
f3 = *px; px += inc_x;
|
||||
*x = f15; x += inc_x;
|
||||
f4 = *px; px += inc_x;
|
||||
f5 = *px; px += inc_x;
|
||||
f6 = *px; px += inc_x;
|
||||
f7 = *px; px += inc_x;
|
||||
}
|
||||
|
||||
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
|
||||
f4 *= da;
|
||||
f5 *= da;
|
||||
*x = f0; x += inc_x;
|
||||
f6 *= da;
|
||||
*x = f1; x += inc_x;
|
||||
f7 *= da;
|
||||
*x = f2; x += inc_x;
|
||||
f8 *= da;
|
||||
*x = f3; x += inc_x;
|
||||
f9 *= da;
|
||||
*x = f4; x += inc_x;
|
||||
f10 *= da;
|
||||
*x = f5; x += inc_x;
|
||||
f11 *= da;
|
||||
*x = f6; x += inc_x;
|
||||
f12 *= da;
|
||||
*x = f7; x += inc_x;
|
||||
f13 *= da;
|
||||
*x = f8; x += inc_x;
|
||||
f14 *= da;
|
||||
*x = f9; x += inc_x;
|
||||
f15 *= da;
|
||||
*x = f10; x += inc_x;
|
||||
*x = f11; x += inc_x;
|
||||
*x = f12; x += inc_x;
|
||||
*x = f13; x += inc_x;
|
||||
*x = f14; x += inc_x;
|
||||
*x = f15; x += inc_x;
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
|
||||
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, inc_x, f0, f1);
|
||||
MUL2(f0, da, f1, da, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x *= da;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,253 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
|
||||
FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i = 0, pref_offsetx, pref_offsety;
|
||||
FLOAT *px, *py;
|
||||
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v2f64 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
|
||||
v2f64 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsetx > 0)
|
||||
{
|
||||
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
|
||||
pref_offsetx = pref_offsetx / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsety > 0)
|
||||
{
|
||||
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
|
||||
pref_offsety = pref_offsety / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
px = srcx;
|
||||
py = srcy;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n >> 4)
|
||||
{
|
||||
LD_DP8_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
|
||||
|
||||
for (i = (n >> 4) - 1; i--;)
|
||||
{
|
||||
PREFETCH(px + pref_offsetx + 16);
|
||||
PREFETCH(px + pref_offsetx + 20);
|
||||
PREFETCH(px + pref_offsetx + 24);
|
||||
PREFETCH(px + pref_offsetx + 28);
|
||||
|
||||
PREFETCH(py + pref_offsety + 16);
|
||||
PREFETCH(py + pref_offsety + 20);
|
||||
PREFETCH(py + pref_offsety + 24);
|
||||
PREFETCH(py + pref_offsety + 28);
|
||||
|
||||
yv0 = LD_DP(py); py += 2;
|
||||
ST_DP(xv0, srcy); srcy += 2;
|
||||
yv1 = LD_DP(py); py += 2;
|
||||
ST_DP(xv1, srcy); srcy += 2;
|
||||
yv2 = LD_DP(py); py += 2;
|
||||
ST_DP(xv2, srcy); srcy += 2;
|
||||
yv3 = LD_DP(py); py += 2;
|
||||
ST_DP(xv3, srcy); srcy += 2;
|
||||
yv4 = LD_DP(py); py += 2;
|
||||
ST_DP(xv4, srcy); srcy += 2;
|
||||
yv5 = LD_DP(py); py += 2;
|
||||
ST_DP(xv5, srcy); srcy += 2;
|
||||
yv6 = LD_DP(py); py += 2;
|
||||
ST_DP(xv6, srcy); srcy += 2;
|
||||
yv7 = LD_DP(py); py += 2;
|
||||
ST_DP(xv7, srcy); srcy += 2;
|
||||
|
||||
xv0 = LD_DP(px); px += 2;
|
||||
ST_DP(yv0, srcx); srcx += 2;
|
||||
xv1 = LD_DP(px); px += 2;
|
||||
ST_DP(yv1, srcx); srcx += 2;
|
||||
xv2 = LD_DP(px); px += 2;
|
||||
ST_DP(yv2, srcx); srcx += 2;
|
||||
xv3 = LD_DP(px); px += 2;
|
||||
ST_DP(yv3, srcx); srcx += 2;
|
||||
xv4 = LD_DP(px); px += 2;
|
||||
ST_DP(yv4, srcx); srcx += 2;
|
||||
xv5 = LD_DP(px); px += 2;
|
||||
ST_DP(yv5, srcx); srcx += 2;
|
||||
xv6 = LD_DP(px); px += 2;
|
||||
ST_DP(yv6, srcx); srcx += 2;
|
||||
xv7 = LD_DP(px); px += 2;
|
||||
ST_DP(yv7, srcx); srcx += 2;
|
||||
}
|
||||
|
||||
LD_DP8_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
|
||||
ST_DP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 2);
|
||||
ST_DP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP7_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
|
||||
LD_DP7_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
|
||||
ST_DP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 2);
|
||||
ST_DP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 2);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_DP6_INC(px, 2, xv0, xv1, xv2, xv3, xv4, xv5);
|
||||
LD_DP6_INC(py, 2, yv0, yv1, yv2, yv3, yv4, yv5);
|
||||
ST_DP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 2);
|
||||
ST_DP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 2);
|
||||
}
|
||||
else if ((n & 8) && (n & 2))
|
||||
{
|
||||
LD_DP5_INC(px, 2, xv0, xv1, xv2, xv3, xv4);
|
||||
LD_DP5_INC(py, 2, yv0, yv1, yv2, yv3, yv4);
|
||||
ST_DP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 2);
|
||||
ST_DP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 2);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP3_INC(px, 2, xv0, xv1, xv2);
|
||||
LD_DP3_INC(py, 2, yv0, yv1, yv2);
|
||||
ST_DP3_INC(xv0, xv1, xv2, srcy, 2);
|
||||
ST_DP3_INC(yv0, yv1, yv2, srcx, 2);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(px, 2, xv0, xv1, xv2, xv3);
|
||||
LD_DP4_INC(py, 2, yv0, yv1, yv2, yv3);
|
||||
ST_DP4_INC(xv0, xv1, xv2, xv3, srcy, 2);
|
||||
ST_DP4_INC(yv0, yv1, yv2, yv3, srcx, 2);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP2_INC(px, 2, xv0, xv1);
|
||||
LD_DP2_INC(py, 2, yv0, yv1);
|
||||
ST_DP2_INC(xv0, xv1, srcy, 2);
|
||||
ST_DP2_INC(yv0, yv1, srcx, 2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
xv0 = LD_DP(px);
|
||||
yv0 = LD_DP(py);
|
||||
|
||||
px += 2;
|
||||
py += 2;
|
||||
|
||||
ST_DP(xv0, srcy);
|
||||
ST_DP(yv0, srcx);
|
||||
|
||||
srcx += 2;
|
||||
srcy += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = px[0];
|
||||
y0 = py[0];
|
||||
srcx[0] = y0;
|
||||
srcy[0] = x0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
|
||||
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
|
||||
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
|
||||
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
|
||||
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
|
||||
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
|
||||
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
|
||||
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 4) && (n & 1))
|
||||
{
|
||||
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
|
||||
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
|
||||
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
|
||||
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(px, inc_x, x0, x1, x2);
|
||||
LD_GP3_INC(py, inc_y, y0, y1, y2);
|
||||
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
|
||||
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
|
||||
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
|
||||
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, inc_x, x0, x1);
|
||||
LD_GP2_INC(py, inc_y, y0, y1);
|
||||
ST_GP2_INC(x0, x1, srcy, inc_y);
|
||||
ST_GP2_INC(y0, y1, srcx, inc_x);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *srcx;
|
||||
y0 = *srcy;
|
||||
|
||||
*srcx = y0;
|
||||
*srcy = x0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -722,6 +722,31 @@ inline static void prefetch_load_lf(unsigned char *src)
|
|||
MUL2(in4, in5, in6, in7, out2, out3); \
|
||||
}
|
||||
|
||||
/* Description : Multiplication of pairs of vectors and added in output
|
||||
Arguments : Inputs - in0, in1, vec, out0, out1
|
||||
Outputs - out0, out1
|
||||
Details : Each element from 'in0' is multiplied with elements from 'vec'
|
||||
and the result is added to 'out0'
|
||||
*/
|
||||
#define FMADD2(in0, in1, vec, inout0, inout1) \
|
||||
{ \
|
||||
inout0 += in0 * vec; \
|
||||
inout1 += in1 * vec; \
|
||||
}
|
||||
#define FMADD3(in0, in1, in2, vec, \
|
||||
inout0, inout1, inout2) \
|
||||
{ \
|
||||
inout0 += in0 * vec; \
|
||||
inout1 += in1 * vec; \
|
||||
inout2 += in2 * vec; \
|
||||
}
|
||||
#define FMADD4(in0, in1, in2, in3, vec, \
|
||||
inout0, inout1, inout2, inout3) \
|
||||
{ \
|
||||
FMADD2(in0, in1, vec, inout0, inout1); \
|
||||
FMADD2(in2, in3, vec, inout2, inout3); \
|
||||
}
|
||||
|
||||
/* Description : Addition of 2 pairs of variables
|
||||
Arguments : Inputs - in0, in1, in2, in3
|
||||
Outputs - out0, out1
|
||||
|
|
|
@ -0,0 +1,265 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#if !defined(CONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *py;
|
||||
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v4f32 da_vec, zero_v = {0};
|
||||
|
||||
if ((n < 0) || (da == 0.0)) return(0);
|
||||
|
||||
py = y;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
FLOAT *x_pref, *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 64;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 64;
|
||||
|
||||
da_vec = COPY_FLOAT_TO_VECTOR(da);
|
||||
|
||||
for (i = (n >> 5); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
x_pref += 32;
|
||||
y_pref += 32;
|
||||
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_SP8_INC(py, 4, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
FMADD4(x4, x5, x6, x7, da_vec, y4, y5, y6, y7);
|
||||
ST_SP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 4);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(x, 4, x0, x1, x2, x3);
|
||||
LD_SP4_INC(py, 4, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
ST_SP4_INC(y0, y1, y2, y3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(x, 4, x0, x1);
|
||||
LD_SP2_INC(py, 4, y0, y1);
|
||||
FMADD2(x0, x1, da_vec, y0, y1);
|
||||
ST_SP2_INC(y0, y1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
x0 = LD_SP(x); x += 4;
|
||||
y0 = LD_SP(py); py += 4;
|
||||
y0 += da_vec * x0;
|
||||
ST_SP(y0, y); y += 4;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
FMADD2(x[0], x[1], da, y[0], y[1]);
|
||||
x += 2;
|
||||
y += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
y[0] += da * x[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (1 == inc_y)
|
||||
{
|
||||
da_vec = COPY_FLOAT_TO_VECTOR(da);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
x2 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x2 = (v4f32) __msa_insert_w((v4i32) x2, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x2 = (v4f32) __msa_insert_w((v4i32) x2, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x2 = (v4f32) __msa_insert_w((v4i32) x2, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) x3, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) x3, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x3 = (v4f32) __msa_insert_w((v4i32) x3, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
LD_SP4_INC(py, 4, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da_vec, y0, y1, y2, y3);
|
||||
ST_SP4_INC(y0, y1, y2, y3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
LD_SP2_INC(py, 4, y0, y1);
|
||||
FMADD2(x0, x1, da_vec, y0, y1);
|
||||
ST_SP2_INC(y0, y1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
y0 = LD_SP(py); py += 4;
|
||||
y0 += da_vec * x0;
|
||||
ST_SP(y0, y); y += 4;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
FMADD2(x[0], x[inc_x], da, y[0], y[1]);
|
||||
|
||||
x += 2 * inc_x;
|
||||
y += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
y[0] += da * x[0];
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
|
||||
FMADD4(x0, x1, x2, x3, da, y0, y1, y2, y3);
|
||||
ST_GP4_INC(y0, y1, y2, y3, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 3)
|
||||
{
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(py, inc_y, y0, y1);
|
||||
FMADD2(x0, x1, da, y0, y1);
|
||||
ST_GP2_INC(y0, y1, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y += da * *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,186 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i;
|
||||
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n > 63)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 128 + 32;
|
||||
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 6) - 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 64;
|
||||
|
||||
x8 = LD_SP(x); x += 4;
|
||||
ST_SP(x0, y); y += 4;
|
||||
x9 = LD_SP(x); x += 4;
|
||||
ST_SP(x1, y); y += 4;
|
||||
x10 = LD_SP(x); x += 4;
|
||||
ST_SP(x2, y); y += 4;
|
||||
x11 = LD_SP(x); x += 4;
|
||||
ST_SP(x3, y); y += 4;
|
||||
x12 = LD_SP(x); x += 4;
|
||||
ST_SP(x4, y); y += 4;
|
||||
x13 = LD_SP(x); x += 4;
|
||||
ST_SP(x5, y); y += 4;
|
||||
x14 = LD_SP(x); x += 4;
|
||||
ST_SP(x6, y); y += 4;
|
||||
x15 = LD_SP(x); x += 4;
|
||||
ST_SP(x7, y); y += 4;
|
||||
x0 = LD_SP(x); x += 4;
|
||||
ST_SP(x8, y); y += 4;
|
||||
x1 = LD_SP(x); x += 4;
|
||||
ST_SP(x9, y); y += 4;
|
||||
x2 = LD_SP(x); x += 4;
|
||||
ST_SP(x10, y); y += 4;
|
||||
x3 = LD_SP(x); x += 4;
|
||||
ST_SP(x11, y); y += 4;
|
||||
x4 = LD_SP(x); x += 4;
|
||||
ST_SP(x12, y); y += 4;
|
||||
x5 = LD_SP(x); x += 4;
|
||||
ST_SP(x13, y); y += 4;
|
||||
x6 = LD_SP(x); x += 4;
|
||||
ST_SP(x14, y); y += 4;
|
||||
x7 = LD_SP(x); x += 4;
|
||||
ST_SP(x15, y); y += 4;
|
||||
}
|
||||
|
||||
x8 = LD_SP(x); x += 4;
|
||||
x9 = LD_SP(x); x += 4;
|
||||
ST_SP(x0, y); y += 4;
|
||||
x10 = LD_SP(x); x += 4;
|
||||
ST_SP(x1, y); y += 4;
|
||||
x11 = LD_SP(x); x += 4;
|
||||
ST_SP(x2, y); y += 4;
|
||||
x12 = LD_SP(x); x += 4;
|
||||
ST_SP(x3, y); y += 4;
|
||||
x13 = LD_SP(x); x += 4;
|
||||
ST_SP(x4, y); y += 4;
|
||||
x14 = LD_SP(x); x += 4;
|
||||
ST_SP(x5, y); y += 4;
|
||||
x15 = LD_SP(x); x += 4;
|
||||
ST_SP(x6, y); y += 4;
|
||||
ST_SP(x7, y); y += 4;
|
||||
|
||||
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 4);
|
||||
}
|
||||
|
||||
if (n & 63)
|
||||
{
|
||||
if (n & 32)
|
||||
{
|
||||
LD_SP8_INC(x, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 4);
|
||||
}
|
||||
|
||||
if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(x, 4, x0, x1, x2, x3);
|
||||
ST_SP4_INC(x0, x1, x2, x3, y, 4);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(x, 4, x0, x1);
|
||||
ST_SP2_INC(x0, x1, y, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(x, 1, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, y, 1);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_GP8_INC(x, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*y = *x;
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,385 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
|
||||
BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT *px;
|
||||
FLOAT f0, f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15;
|
||||
v4f32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
v4f32 da_vec;
|
||||
|
||||
px = x;
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
if (0.0 == da)
|
||||
{
|
||||
v4f32 zero_v = __msa_cast_to_vector_float(0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
|
||||
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
|
||||
|
||||
for (i = (n >> 6); i--;)
|
||||
{
|
||||
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 4);
|
||||
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 4);
|
||||
}
|
||||
|
||||
if (n & 63)
|
||||
{
|
||||
if (n & 32)
|
||||
{
|
||||
ST_SP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 4);
|
||||
}
|
||||
|
||||
if (n & 16)
|
||||
{
|
||||
ST_SP4_INC(zero_v, zero_v, zero_v, zero_v, x, 4);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
ST_SP2_INC(zero_v, zero_v, x, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
*x = 0; x += 1;
|
||||
*x = 0; x += 1;
|
||||
*x = 0; x += 1;
|
||||
*x = 0; x += 1;
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
*x = 0; x += 1;
|
||||
*x = 0; x += 1;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x = 0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
da_vec = COPY_FLOAT_TO_VECTOR(da);
|
||||
|
||||
if (n > 63)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 64 + 32;
|
||||
|
||||
LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = 0; i < (n >> 6) - 1; i++)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 64;
|
||||
|
||||
x8 = LD_SP(px); px += 4;
|
||||
x0 *= da_vec;
|
||||
x9 = LD_SP(px); px += 4;
|
||||
x1 *= da_vec;
|
||||
x10 = LD_SP(px); px += 4;
|
||||
x2 *= da_vec;
|
||||
x11 = LD_SP(px); px += 4;
|
||||
x3 *= da_vec;
|
||||
x12 = LD_SP(px); px += 4;
|
||||
x4 *= da_vec;
|
||||
x13 = LD_SP(px); px += 4;
|
||||
x5 *= da_vec;
|
||||
x14 = LD_SP(px); px += 4;
|
||||
x6 *= da_vec;
|
||||
x15 = LD_SP(px); px += 4;
|
||||
x7 *= da_vec;
|
||||
x8 *= da_vec;
|
||||
ST_SP(x0, x); x += 4;
|
||||
x9 *= da_vec;
|
||||
ST_SP(x1, x); x += 4;
|
||||
x10 *= da_vec;
|
||||
ST_SP(x2, x); x += 4;
|
||||
x11 *= da_vec;
|
||||
ST_SP(x3, x); x += 4;
|
||||
x12 *= da_vec;
|
||||
ST_SP(x4, x); x += 4;
|
||||
x13 *= da_vec;
|
||||
ST_SP(x5, x); x += 4;
|
||||
x14 *= da_vec;
|
||||
ST_SP(x6, x); x += 4;
|
||||
x15 *= da_vec;
|
||||
ST_SP(x7, x); x += 4;
|
||||
ST_SP(x8, x); x += 4;
|
||||
x0 = LD_SP(px); px += 4;
|
||||
ST_SP(x9, x); x += 4;
|
||||
x1 = LD_SP(px); px += 4;
|
||||
ST_SP(x10, x); x += 4;
|
||||
x2 = LD_SP(px); px += 4;
|
||||
ST_SP(x11, x); x += 4;
|
||||
x3 = LD_SP(px); px += 4;
|
||||
ST_SP(x12, x); x += 4;
|
||||
x4 = LD_SP(px); px += 4;
|
||||
ST_SP(x13, x); x += 4;
|
||||
x5 = LD_SP(px); px += 4;
|
||||
ST_SP(x14, x); x += 4;
|
||||
x6 = LD_SP(px); px += 4;
|
||||
ST_SP(x15, x); x += 4;
|
||||
x7 = LD_SP(px); px += 4;
|
||||
}
|
||||
|
||||
x8 = LD_SP(px); px += 4;
|
||||
x0 *= da_vec;
|
||||
x9 = LD_SP(px); px += 4;
|
||||
x1 *= da_vec;
|
||||
x10 = LD_SP(px); px += 4;
|
||||
x2 *= da_vec;
|
||||
x11 = LD_SP(px); px += 4;
|
||||
x3 *= da_vec;
|
||||
x12 = LD_SP(px); px += 4;
|
||||
x4 *= da_vec;
|
||||
x13 = LD_SP(px); px += 4;
|
||||
x5 *= da_vec;
|
||||
x14 = LD_SP(px); px += 4;
|
||||
x6 *= da_vec;
|
||||
x15 = LD_SP(px); px += 4;
|
||||
x7 *= da_vec;
|
||||
x8 *= da_vec;
|
||||
ST_SP(x0, x); x += 4;
|
||||
x9 *= da_vec;
|
||||
ST_SP(x1, x); x += 4;
|
||||
x10 *= da_vec;
|
||||
ST_SP(x2, x); x += 4;
|
||||
x11 *= da_vec;
|
||||
ST_SP(x3, x); x += 4;
|
||||
x12 *= da_vec;
|
||||
ST_SP(x4, x); x += 4;
|
||||
x13 *= da_vec;
|
||||
ST_SP(x5, x); x += 4;
|
||||
x15 *= da_vec;
|
||||
ST_SP(x6, x); x += 4;
|
||||
x14 *= da_vec;
|
||||
ST_SP(x7, x); x += 4;
|
||||
|
||||
ST_SP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, x, 4);
|
||||
}
|
||||
|
||||
if (n & 63)
|
||||
{
|
||||
if (n & 32)
|
||||
{
|
||||
LD_SP8_INC(px, 4, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
|
||||
MUL4(x4, da_vec, x5, da_vec, x6, da_vec, x7, da_vec, x4, x5, x6, x7);
|
||||
ST_SP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 4);
|
||||
}
|
||||
|
||||
if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(px, 4, x0, x1, x2, x3);
|
||||
MUL4(x0, da_vec, x1, da_vec, x2, da_vec, x3, da_vec, x0, x1, x2, x3);
|
||||
ST_SP4_INC(x0, x1, x2, x3, x, 4);
|
||||
}
|
||||
|
||||
if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(px, 4, x0, x1);
|
||||
MUL2(x0, da_vec, x1, da_vec, x0, x1);
|
||||
ST_SP2_INC(x0, x1, x, 4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(px, 1, f0, f1, f2, f3);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, x, 1);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da, f1, da, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, 1);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x *= da;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (0.0 == da)
|
||||
{
|
||||
for (i = n; i--;)
|
||||
{
|
||||
*x = 0;
|
||||
x += inc_x;
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (n > 15)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
|
||||
for (i = 0; i < (n >> 4) - 1; i++)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
|
||||
f4 *= da;
|
||||
f5 *= da;
|
||||
*x = f0; x += inc_x;
|
||||
f6 *= da;
|
||||
*x = f1; x += inc_x;
|
||||
f7 *= da;
|
||||
*x = f2; x += inc_x;
|
||||
f8 *= da;
|
||||
*x = f3; x += inc_x;
|
||||
f9 *= da;
|
||||
*x = f4; x += inc_x;
|
||||
f10 *= da;
|
||||
*x = f5; x += inc_x;
|
||||
f11 *= da;
|
||||
*x = f6; x += inc_x;
|
||||
f12 *= da;
|
||||
*x = f7; x += inc_x;
|
||||
f13 *= da;
|
||||
*x = f8; x += inc_x;
|
||||
f14 *= da;
|
||||
*x = f9; x += inc_x;
|
||||
f15 *= da;
|
||||
*x = f10; x += inc_x;
|
||||
*x = f11; x += inc_x;
|
||||
f0 = *px; px += inc_x;
|
||||
*x = f12; x += inc_x;
|
||||
f1 = *px; px += inc_x;
|
||||
*x = f13; x += inc_x;
|
||||
f2 = *px; px += inc_x;
|
||||
*x = f14; x += inc_x;
|
||||
f3 = *px; px += inc_x;
|
||||
*x = f15; x += inc_x;
|
||||
f4 = *px; px += inc_x;
|
||||
f5 = *px; px += inc_x;
|
||||
f6 = *px; px += inc_x;
|
||||
f7 = *px; px += inc_x;
|
||||
}
|
||||
|
||||
LD_GP8_INC(px, inc_x, f8, f9, f10, f11, f12, f13, f14, f15);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
|
||||
f4 *= da;
|
||||
f5 *= da;
|
||||
*x = f0; x += inc_x;
|
||||
f6 *= da;
|
||||
*x = f1; x += inc_x;
|
||||
f7 *= da;
|
||||
*x = f2; x += inc_x;
|
||||
f8 *= da;
|
||||
*x = f3; x += inc_x;
|
||||
f9 *= da;
|
||||
*x = f4; x += inc_x;
|
||||
f10 *= da;
|
||||
*x = f5; x += inc_x;
|
||||
f11 *= da;
|
||||
*x = f6; x += inc_x;
|
||||
f12 *= da;
|
||||
*x = f7; x += inc_x;
|
||||
f13 *= da;
|
||||
*x = f8; x += inc_x;
|
||||
f14 *= da;
|
||||
*x = f9; x += inc_x;
|
||||
f15 *= da;
|
||||
*x = f10; x += inc_x;
|
||||
*x = f11; x += inc_x;
|
||||
*x = f12; x += inc_x;
|
||||
*x = f13; x += inc_x;
|
||||
*x = f14; x += inc_x;
|
||||
*x = f15; x += inc_x;
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, f0, f1, f2, f3, f4, f5, f6, f7);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
MUL4(f4, da, f5, da, f6, da, f7, da, f4, f5, f6, f7);
|
||||
ST_GP8_INC(f0, f1, f2, f3, f4, f5, f6, f7, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(px, inc_x, f0, f1, f2, f3);
|
||||
MUL4(f0, da, f1, da, f2, da, f3, da, f0, f1, f2, f3);
|
||||
ST_GP4_INC(f0, f1, f2, f3, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, inc_x, f0, f1);
|
||||
MUL2(f0, da, f1, da, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, inc_x);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
*x *= da;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -0,0 +1,267 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy, BLASLONG inc_y,
|
||||
FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i = 0, pref_offsetx, pref_offsety;
|
||||
FLOAT *px, *py;
|
||||
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v4f32 xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7;
|
||||
v4f32 yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsetx > 0)
|
||||
{
|
||||
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
|
||||
pref_offsetx = pref_offsetx / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsety > 0)
|
||||
{
|
||||
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
|
||||
pref_offsety = pref_offsety / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
px = srcx;
|
||||
py = srcy;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n >> 5)
|
||||
{
|
||||
LD_SP8_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7);
|
||||
|
||||
for (i = (n >> 5) - 1; i--;)
|
||||
{
|
||||
PREFETCH(px + pref_offsetx + 32);
|
||||
PREFETCH(px + pref_offsetx + 40);
|
||||
PREFETCH(px + pref_offsetx + 48);
|
||||
PREFETCH(px + pref_offsetx + 56);
|
||||
|
||||
PREFETCH(py + pref_offsety + 32);
|
||||
PREFETCH(py + pref_offsety + 40);
|
||||
PREFETCH(py + pref_offsety + 48);
|
||||
PREFETCH(py + pref_offsety + 56);
|
||||
|
||||
yv0 = LD_SP(py); py += 4;
|
||||
ST_SP(xv0, srcy); srcy += 4;
|
||||
yv1 = LD_SP(py); py += 4;
|
||||
ST_SP(xv1, srcy); srcy += 4;
|
||||
yv2 = LD_SP(py); py += 4;
|
||||
ST_SP(xv2, srcy); srcy += 4;
|
||||
yv3 = LD_SP(py); py += 4;
|
||||
ST_SP(xv3, srcy); srcy += 4;
|
||||
yv4 = LD_SP(py); py += 4;
|
||||
ST_SP(xv4, srcy); srcy += 4;
|
||||
yv5 = LD_SP(py); py += 4;
|
||||
ST_SP(xv5, srcy); srcy += 4;
|
||||
yv6 = LD_SP(py); py += 4;
|
||||
ST_SP(xv6, srcy); srcy += 4;
|
||||
yv7 = LD_SP(py); py += 4;
|
||||
ST_SP(xv7, srcy); srcy += 4;
|
||||
|
||||
xv0 = LD_SP(px); px += 4;
|
||||
ST_SP(yv0, srcx); srcx += 4;
|
||||
xv1 = LD_SP(px); px += 4;
|
||||
ST_SP(yv1, srcx); srcx += 4;
|
||||
xv2 = LD_SP(px); px += 4;
|
||||
ST_SP(yv2, srcx); srcx += 4;
|
||||
xv3 = LD_SP(px); px += 4;
|
||||
ST_SP(yv3, srcx); srcx += 4;
|
||||
xv4 = LD_SP(px); px += 4;
|
||||
ST_SP(yv4, srcx); srcx += 4;
|
||||
xv5 = LD_SP(px); px += 4;
|
||||
ST_SP(yv5, srcx); srcx += 4;
|
||||
xv6 = LD_SP(px); px += 4;
|
||||
ST_SP(yv6, srcx); srcx += 4;
|
||||
xv7 = LD_SP(px); px += 4;
|
||||
ST_SP(yv7, srcx); srcx += 4;
|
||||
}
|
||||
|
||||
LD_SP8_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7);
|
||||
ST_SP8_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, xv7, srcy, 4);
|
||||
ST_SP8_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, yv7, srcx, 4);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if ((n & 16) && (n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP7_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5, xv6);
|
||||
LD_SP7_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5, yv6);
|
||||
ST_SP7_INC(xv0, xv1, xv2, xv3, xv4, xv5, xv6, srcy, 4);
|
||||
ST_SP7_INC(yv0, yv1, yv2, yv3, yv4, yv5, yv6, srcx, 4);
|
||||
}
|
||||
else if ((n & 16) && (n & 8))
|
||||
{
|
||||
LD_SP6_INC(px, 4, xv0, xv1, xv2, xv3, xv4, xv5);
|
||||
LD_SP6_INC(py, 4, yv0, yv1, yv2, yv3, yv4, yv5);
|
||||
ST_SP6_INC(xv0, xv1, xv2, xv3, xv4, xv5, srcy, 4);
|
||||
ST_SP6_INC(yv0, yv1, yv2, yv3, yv4, yv5, srcx, 4);
|
||||
}
|
||||
else if ((n & 16) && (n & 4))
|
||||
{
|
||||
LD_SP5_INC(px, 4, xv0, xv1, xv2, xv3, xv4);
|
||||
LD_SP5_INC(py, 4, yv0, yv1, yv2, yv3, yv4);
|
||||
ST_SP5_INC(xv0, xv1, xv2, xv3, xv4, srcy, 4);
|
||||
ST_SP5_INC(yv0, yv1, yv2, yv3, yv4, srcx, 4);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP3_INC(px, 4, xv0, xv1, xv2);
|
||||
LD_SP3_INC(py, 4, yv0, yv1, yv2);
|
||||
ST_SP3_INC(xv0, xv1, xv2, srcy, 4);
|
||||
ST_SP3_INC(yv0, yv1, yv2, srcx, 4);
|
||||
}
|
||||
else if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(px, 4, xv0, xv1, xv2, xv3);
|
||||
LD_SP4_INC(py, 4, yv0, yv1, yv2, yv3);
|
||||
ST_SP4_INC(xv0, xv1, xv2, xv3, srcy, 4);
|
||||
ST_SP4_INC(yv0, yv1, yv2, yv3, srcx, 4);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(px, 4, xv0, xv1);
|
||||
LD_SP2_INC(py, 4, yv0, yv1);
|
||||
ST_SP2_INC(xv0, xv1, srcy, 4);
|
||||
ST_SP2_INC(yv0, yv1, srcx, 4);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
xv0 = LD_SP(px);
|
||||
yv0 = LD_SP(py);
|
||||
|
||||
px += 4;
|
||||
py += 4;
|
||||
|
||||
ST_SP(xv0, srcy);
|
||||
ST_SP(yv0, srcx);
|
||||
|
||||
srcx += 4;
|
||||
srcy += 4;
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(px, 1, x0, x1, x3);
|
||||
LD_GP3_INC(py, 1, y0, y1, y3);
|
||||
ST_GP3_INC(x0, x1, x3, srcy, 1);
|
||||
ST_GP3_INC(y0, y1, y3, srcx, 1);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, 1, x0, x1);
|
||||
LD_GP2_INC(py, 1, y0, y1);
|
||||
ST_GP2_INC(x0, x1, srcy, 1);
|
||||
ST_GP2_INC(y0, y1, srcx, 1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = px[0];
|
||||
y0 = py[0];
|
||||
srcx[0] = y0;
|
||||
srcy[0] = x0;
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_GP8_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_GP8_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
ST_GP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y);
|
||||
ST_GP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP7_INC(px, inc_x, x0, x1, x2, x3, x4, x5, x6);
|
||||
LD_GP7_INC(py, inc_y, y0, y1, y2, y3, y4, y5, y6);
|
||||
ST_GP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y);
|
||||
ST_GP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_GP6_INC(px, inc_x, x0, x1, x2, x3, x4, x5);
|
||||
LD_GP6_INC(py, inc_y, y0, y1, y2, y3, y4, y5);
|
||||
ST_GP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y);
|
||||
ST_GP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 4) && (n & 1))
|
||||
{
|
||||
LD_GP5_INC(px, inc_x, x0, x1, x2, x3, x4);
|
||||
LD_GP5_INC(py, inc_y, y0, y1, y2, y3, y4);
|
||||
ST_GP5_INC(x0, x1, x2, x3, x4, srcy, inc_y);
|
||||
ST_GP5_INC(y0, y1, y2, y3, y4, srcx, inc_x);
|
||||
}
|
||||
else if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(px, inc_x, x0, x1, x2);
|
||||
LD_GP3_INC(py, inc_y, y0, y1, y2);
|
||||
ST_GP3_INC(x0, x1, x2, srcy, inc_y);
|
||||
ST_GP3_INC(y0, y1, y2, srcx, inc_x);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_GP4_INC(px, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(py, inc_y, y0, y1, y2, y3);
|
||||
ST_GP4_INC(x0, x1, x2, x3, srcy, inc_y);
|
||||
ST_GP4_INC(y0, y1, y2, y3, srcx, inc_x);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(px, inc_x, x0, x1);
|
||||
LD_GP2_INC(py, inc_y, y0, y1);
|
||||
ST_GP2_INC(x0, x1, srcy, inc_y);
|
||||
ST_GP2_INC(y0, y1, srcx, inc_x);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *srcx;
|
||||
y0 = *srcy;
|
||||
|
||||
*srcx = y0;
|
||||
*srcy = x0;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,494 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#if !defined(CONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i, inc_x2, inc_y2;
|
||||
FLOAT *py;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
v2f64 y0, y1, y2, y3, y4, y5, y6, y7, dar_vec, dai_vec;
|
||||
v2f64 x0r, x1r, x2r, x3r, x0i, x1i, x2i, x3i;
|
||||
v2f64 y0r, y1r, y2r, y3r, y0i, y1i, y2i, y3i;
|
||||
FLOAT xd0, xd1, yd0, yd1;
|
||||
|
||||
if (n < 0) return(0);
|
||||
if ((da_r == 0.0) && (da_i == 0.0)) return(0);
|
||||
|
||||
py = y;
|
||||
|
||||
dar_vec = COPY_DOUBLE_TO_VECTOR(da_r);
|
||||
dai_vec = COPY_DOUBLE_TO_VECTOR(da_i);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
FLOAT *x_pref, *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 32;
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
x_pref += 16;
|
||||
y_pref += 16;
|
||||
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ILVRL_D2_DP(y2i, y2r, y4, y5);
|
||||
ILVRL_D2_DP(y3i, y3r, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, 2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, 2, y0, y1, y2, y3);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, 2, x0, x1);
|
||||
LD_DP2_INC(py, 2, y0, y1);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (1 == inc_y)
|
||||
{
|
||||
FLOAT *y_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)y & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
y_pref = y + pref_offset + 32;
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
PREF_OFFSET(y_pref, 0);
|
||||
PREF_OFFSET(y_pref, 32);
|
||||
PREF_OFFSET(y_pref, 64);
|
||||
PREF_OFFSET(y_pref, 96);
|
||||
y_pref += 16;
|
||||
|
||||
LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ILVRL_D2_DP(y2i, y2r, y4, y5);
|
||||
ILVRL_D2_DP(y3i, y3r, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, 2, y0, y1, y2, y3);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, inc_x2, x0, x1);
|
||||
LD_DP2_INC(py, 2, y0, y1);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (1 == inc_x)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32;
|
||||
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
x_pref += 16;
|
||||
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ILVRL_D2_DP(y2i, y2r, y4, y5);
|
||||
ILVRL_D2_DP(y3i, y3r, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, 2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, 2, x0, x1);
|
||||
LD_DP2_INC(py, inc_y2, y0, y1);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP8_INC(x, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
PCKEVOD_D2_DP(x5, x4, x2r, x2i);
|
||||
PCKEVOD_D2_DP(y5, y4, y2r, y2i);
|
||||
PCKEVOD_D2_DP(x7, x6, x3r, x3i);
|
||||
PCKEVOD_D2_DP(y7, y6, y3r, y3i);
|
||||
|
||||
FMADD4(x0r, x1r, x2r, x3r, dar_vec, y0r, y1r, y2r, y3r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y2i OP0 dar_vec * x2i;
|
||||
y3i OP0 dar_vec * x3i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y2r OP1 dai_vec * x2i;
|
||||
y3r OP1 dai_vec * x3i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
y2i OP2 dai_vec * x2r;
|
||||
y3i OP2 dai_vec * x3r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ILVRL_D2_DP(y2i, y2r, y4, y5);
|
||||
ILVRL_D2_DP(y3i, y3r, y6, y7);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, inc_x2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
PCKEVOD_D2_DP(x3, x2, x1r, x1i);
|
||||
PCKEVOD_D2_DP(y3, y2, y1r, y1i);
|
||||
|
||||
FMADD2(x0r, x1r, dar_vec, y0r, y1r);
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y1i OP0 dar_vec * x1i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y1r OP1 dai_vec * x1i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
y1i OP2 dai_vec * x1r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ILVRL_D2_DP(y1i, y1r, y2, y3);
|
||||
ST_DP4_INC(y0, y1, y2, y3, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, inc_x2, x0, x1);
|
||||
LD_DP2_INC(py, inc_y2, y0, y1);
|
||||
PCKEVOD_D2_DP(x1, x0, x0r, x0i);
|
||||
PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
y0r += dar_vec * x0r;
|
||||
y0i OP0 dar_vec * x0i;
|
||||
y0r OP1 dai_vec * x0i;
|
||||
y0i OP2 dai_vec * x0r;
|
||||
|
||||
ILVRL_D2_DP(y0i, y0r, y0, y1);
|
||||
ST_DP2_INC(y0, y1, y, inc_y2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, xd0, xd1);
|
||||
LD_GP2_INC(py, 1, yd0, yd1);
|
||||
|
||||
yd0 += da_r * xd0;
|
||||
yd1 OP0 da_r * xd1;
|
||||
yd0 OP1 da_i * xd1;
|
||||
yd1 OP2 da_i * xd0;
|
||||
|
||||
ST_GP2_INC(yd0, yd1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,218 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
{
|
||||
BLASLONG i;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
FLOAT f0, f1;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n > 15)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 64 + 16;
|
||||
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 4) - 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
x8 = LD_DP(x); x += 2;
|
||||
ST_DP(x0, y); y += 2;
|
||||
x9 = LD_DP(x); x += 2;
|
||||
ST_DP(x1, y); y += 2;
|
||||
x10 = LD_DP(x); x += 2;
|
||||
ST_DP(x2, y); y += 2;
|
||||
x11 = LD_DP(x); x += 2;
|
||||
ST_DP(x3, y); y += 2;
|
||||
x12 = LD_DP(x); x += 2;
|
||||
ST_DP(x4, y); y += 2;
|
||||
x13 = LD_DP(x); x += 2;
|
||||
ST_DP(x5, y); y += 2;
|
||||
x14 = LD_DP(x); x += 2;
|
||||
ST_DP(x6, y); y += 2;
|
||||
x15 = LD_DP(x); x += 2;
|
||||
ST_DP(x7, y); y += 2;
|
||||
x0 = LD_DP(x); x += 2;
|
||||
ST_DP(x8, y); y += 2;
|
||||
x1 = LD_DP(x); x += 2;
|
||||
ST_DP(x9, y); y += 2;
|
||||
x2 = LD_DP(x); x += 2;
|
||||
ST_DP(x10, y); y += 2;
|
||||
x3 = LD_DP(x); x += 2;
|
||||
ST_DP(x11, y); y += 2;
|
||||
x4 = LD_DP(x); x += 2;
|
||||
ST_DP(x12, y); y += 2;
|
||||
x5 = LD_DP(x); x += 2;
|
||||
ST_DP(x13, y); y += 2;
|
||||
x6 = LD_DP(x); x += 2;
|
||||
ST_DP(x14, y); y += 2;
|
||||
x7 = LD_DP(x); x += 2;
|
||||
ST_DP(x15, y); y += 2;
|
||||
}
|
||||
|
||||
x8 = LD_DP(x); x += 2;
|
||||
x9 = LD_DP(x); x += 2;
|
||||
ST_DP(x0, y); y += 2;
|
||||
x10 = LD_DP(x); x += 2;
|
||||
ST_DP(x1, y); y += 2;
|
||||
x11 = LD_DP(x); x += 2;
|
||||
ST_DP(x2, y); y += 2;
|
||||
x12 = LD_DP(x); x += 2;
|
||||
ST_DP(x3, y); y += 2;
|
||||
x13 = LD_DP(x); x += 2;
|
||||
ST_DP(x4, y); y += 2;
|
||||
x14 = LD_DP(x); x += 2;
|
||||
ST_DP(x5, y); y += 2;
|
||||
x15 = LD_DP(x); x += 2;
|
||||
ST_DP(x6, y); y += 2;
|
||||
ST_DP(x7, y); y += 2;
|
||||
|
||||
ST_DP8_INC(x8, x9, x10, x11, x12, x13, x14, x15, y, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(x, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, 2, x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, y, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, 2, x0, x1);
|
||||
ST_DP2_INC(x0, x1, y, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x *= 2;
|
||||
inc_y *= 2;
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
x0 = LD_DP(x); x += inc_x;
|
||||
x1 = LD_DP(x); x += inc_x;
|
||||
x2 = LD_DP(x); x += inc_x;
|
||||
x3 = LD_DP(x); x += inc_x;
|
||||
x4 = LD_DP(x); x += inc_x;
|
||||
x5 = LD_DP(x); x += inc_x;
|
||||
x6 = LD_DP(x); x += inc_x;
|
||||
x7 = LD_DP(x); x += inc_x;
|
||||
x8 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x0, y); y += inc_y;
|
||||
x9 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x1, y); y += inc_y;
|
||||
x10 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x2, y); y += inc_y;
|
||||
x11 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x3, y); y += inc_y;
|
||||
x12 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x4, y); y += inc_y;
|
||||
x13 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x5, y); y += inc_y;
|
||||
x14 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x6, y); y += inc_y;
|
||||
x15 = LD_DP(x); x += inc_x;
|
||||
ST_DP(x7, y); y += inc_y;
|
||||
ST_DP(x8, y); y += inc_y;
|
||||
ST_DP(x9, y); y += inc_y;
|
||||
ST_DP(x10, y); y += inc_y;
|
||||
ST_DP(x11, y); y += inc_y;
|
||||
ST_DP(x12, y); y += inc_y;
|
||||
ST_DP(x13, y); y += inc_y;
|
||||
ST_DP(x14, y); y += inc_y;
|
||||
ST_DP(x15, y); y += inc_y;
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(x, inc_x, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, inc_x, x0, x1);
|
||||
ST_DP2_INC(x0, x1, y, inc_y);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, f0, f1);
|
||||
ST_GP2_INC(f0, f1, y, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,717 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2017, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* This will shuffle the elements in 'in' vector as (mask needed :: 01 00 11 10)
|
||||
0 1 2 3 => 2 3 0 1 */
|
||||
#define SHF_78 78
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
|
||||
FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy,
|
||||
BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i, inc_x2;
|
||||
FLOAT *px;
|
||||
FLOAT tp0, tp1, f0, f1;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15;
|
||||
v2f64 d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15;
|
||||
v2f64 da_i_vec, da_i_vec_neg, da_r_vec;
|
||||
|
||||
px = x;
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
if ((0.0 == da_r) && (0.0 == da_i))
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
ST_DP2_INC(zero_v, zero_v, x, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
ST_DP(zero_v, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (0.0 == da_r)
|
||||
{
|
||||
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
|
||||
da_i_vec_neg = -da_i_vec;
|
||||
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
|
||||
|
||||
if (n > 15)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32 + 16;
|
||||
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 4)- 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
x8 = LD_DP(px); px += 2;
|
||||
x0 *= da_i_vec;
|
||||
x9 = LD_DP(px); px += 2;
|
||||
x1 *= da_i_vec;
|
||||
x10 = LD_DP(px); px += 2;
|
||||
x2 *= da_i_vec;
|
||||
x11 = LD_DP(px); px += 2;
|
||||
x3 *= da_i_vec;
|
||||
x12 = LD_DP(px); px += 2;
|
||||
x4 *= da_i_vec;
|
||||
x13 = LD_DP(px); px += 2;
|
||||
x5 *= da_i_vec;
|
||||
x0 = (v2f64) __msa_shf_w((v4i32) x0, SHF_78);
|
||||
x14 = LD_DP(px); px += 2;
|
||||
x6 *= da_i_vec;
|
||||
x1 = (v2f64) __msa_shf_w((v4i32) x1, SHF_78);
|
||||
x15 = LD_DP(px); px += 2;
|
||||
x7 *= da_i_vec;
|
||||
x2 = (v2f64) __msa_shf_w((v4i32) x2, SHF_78);
|
||||
x8 *= da_i_vec;
|
||||
x3 = (v2f64) __msa_shf_w((v4i32) x3, SHF_78);
|
||||
ST_DP(x0, x); x += 2;
|
||||
x9 *= da_i_vec;
|
||||
x4 = (v2f64) __msa_shf_w((v4i32) x4, SHF_78);
|
||||
ST_DP(x1, x); x += 2;
|
||||
x10 *= da_i_vec;
|
||||
x5 = (v2f64) __msa_shf_w((v4i32) x5, SHF_78);
|
||||
ST_DP(x2, x); x += 2;
|
||||
x11 *= da_i_vec;
|
||||
x6 = (v2f64) __msa_shf_w((v4i32) x6, SHF_78);
|
||||
ST_DP(x3, x); x += 2;
|
||||
x12 *= da_i_vec;
|
||||
x7 = (v2f64) __msa_shf_w((v4i32) x7, SHF_78);
|
||||
ST_DP(x4, x); x += 2;
|
||||
x13 *= da_i_vec;
|
||||
x8 = (v2f64) __msa_shf_w((v4i32) x8, SHF_78);
|
||||
ST_DP(x5, x); x += 2;
|
||||
x14 *= da_i_vec;
|
||||
x9 = (v2f64) __msa_shf_w((v4i32) x9, SHF_78);
|
||||
ST_DP(x6, x); x += 2;
|
||||
x15 *= da_i_vec;
|
||||
x10 = (v2f64) __msa_shf_w((v4i32) x10, SHF_78);
|
||||
ST_DP(x7, x); x += 2;
|
||||
x11 = (v2f64) __msa_shf_w((v4i32) x11, SHF_78);
|
||||
ST_DP(x8, x); x += 2;
|
||||
x0 = LD_DP(px); px += 2;
|
||||
x12 = (v2f64) __msa_shf_w((v4i32) x12, SHF_78);
|
||||
ST_DP(x9, x); x += 2;
|
||||
x1 = LD_DP(px); px += 2;
|
||||
x13 = (v2f64) __msa_shf_w((v4i32) x13, SHF_78);
|
||||
ST_DP(x10, x); x += 2;
|
||||
x2 = LD_DP(px); px += 2;
|
||||
x14 = (v2f64) __msa_shf_w((v4i32) x14, SHF_78);
|
||||
ST_DP(x11, x); x += 2;
|
||||
x3 = LD_DP(px); px += 2;
|
||||
x15 = (v2f64) __msa_shf_w((v4i32) x15, SHF_78);
|
||||
ST_DP(x12, x); x += 2;
|
||||
x4 = LD_DP(px); px += 2;
|
||||
ST_DP(x13, x); x += 2;
|
||||
x5 = LD_DP(px); px += 2;
|
||||
ST_DP(x14, x); x += 2;
|
||||
x6 = LD_DP(px); px += 2;
|
||||
ST_DP(x15, x); x += 2;
|
||||
x7 = LD_DP(px); px += 2;
|
||||
}
|
||||
|
||||
LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
x4, x5, x6, x7);
|
||||
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
|
||||
x8, x9, x10, x11);
|
||||
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
|
||||
x12, x13, x14, x15);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
|
||||
SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
|
||||
SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
|
||||
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
|
||||
x12, x13, x14, x15, x, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
x4, x5, x6, x7);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, 2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
ST_DP4_INC(x0, x1, x2, x3, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, 2, x0, x1);
|
||||
MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
|
||||
SHF_W2_DP(x0, x1, x0, x1, SHF_78);
|
||||
ST_DP2_INC(x0, x1, x, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da_i, f1, -da_i, f0, f1);
|
||||
ST_GP2_INC(f1, f0, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (0.0 == da_i)
|
||||
{
|
||||
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
|
||||
|
||||
if (n > 15)
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32 + 16;
|
||||
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
for (i = (n >> 4)- 1; i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
x8 = LD_DP(px); px += 2;
|
||||
x0 *= da_r_vec;
|
||||
x9 = LD_DP(px); px += 2;
|
||||
x1 *= da_r_vec;
|
||||
x10 = LD_DP(px); px += 2;
|
||||
x2 *= da_r_vec;
|
||||
x11 = LD_DP(px); px += 2;
|
||||
x3 *= da_r_vec;
|
||||
x12 = LD_DP(px); px += 2;
|
||||
x4 *= da_r_vec;
|
||||
x13 = LD_DP(px); px += 2;
|
||||
x5 *= da_r_vec;
|
||||
ST_DP(x0, x); x += 2;
|
||||
x14 = LD_DP(px); px += 2;
|
||||
x6 *= da_r_vec;
|
||||
ST_DP(x1, x); x += 2;
|
||||
x15 = LD_DP(px); px += 2;
|
||||
x7 *= da_r_vec;
|
||||
ST_DP(x2, x); x += 2;
|
||||
x8 *= da_r_vec;
|
||||
ST_DP(x3, x); x += 2;
|
||||
x9 *= da_r_vec;
|
||||
ST_DP(x4, x); x += 2;
|
||||
x10 *= da_r_vec;
|
||||
ST_DP(x5, x); x += 2;
|
||||
x11 *= da_r_vec;
|
||||
ST_DP(x6, x); x += 2;
|
||||
x12 *= da_r_vec;
|
||||
ST_DP(x7, x); x += 2;
|
||||
x13 *= da_r_vec;
|
||||
ST_DP(x8, x); x += 2;
|
||||
x0 = LD_DP(px); px += 2;
|
||||
x14 *= da_r_vec;
|
||||
ST_DP(x9, x); x += 2;
|
||||
x1 = LD_DP(px); px += 2;
|
||||
x15 *= da_r_vec;
|
||||
ST_DP(x10, x); x += 2;
|
||||
x2 = LD_DP(px); px += 2;
|
||||
ST_DP(x11, x); x += 2;
|
||||
x3 = LD_DP(px); px += 2;
|
||||
ST_DP(x12, x); x += 2;
|
||||
x4 = LD_DP(px); px += 2;
|
||||
ST_DP(x13, x); x += 2;
|
||||
x5 = LD_DP(px); px += 2;
|
||||
ST_DP(x14, x); x += 2;
|
||||
x6 = LD_DP(px); px += 2;
|
||||
ST_DP(x15, x); x += 2;
|
||||
x7 = LD_DP(px); px += 2;
|
||||
}
|
||||
|
||||
LD_DP8_INC(px, 2, x8, x9, x10, x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
|
||||
x4, x5, x6, x7);
|
||||
MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
|
||||
x8, x9, x10, x11);
|
||||
MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
|
||||
x12, x13, x14, x15);
|
||||
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
|
||||
x12, x13, x14, x15, x, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
|
||||
x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, 2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, 2, x0, x1);
|
||||
MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
|
||||
ST_DP2_INC(x0, x1, x, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da_r, f1, da_r, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
FLOAT *x_pref;
|
||||
BLASLONG pref_offset;
|
||||
|
||||
pref_offset = (BLASLONG)x & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offset > 0)
|
||||
{
|
||||
pref_offset = L1_DATA_LINESIZE - pref_offset;
|
||||
pref_offset = pref_offset / sizeof(FLOAT);
|
||||
}
|
||||
x_pref = x + pref_offset + 32;
|
||||
|
||||
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
|
||||
da_i_vec_neg = -da_i_vec;
|
||||
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
|
||||
|
||||
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
PREF_OFFSET(x_pref, 0);
|
||||
PREF_OFFSET(x_pref, 32);
|
||||
PREF_OFFSET(x_pref, 64);
|
||||
PREF_OFFSET(x_pref, 96);
|
||||
PREF_OFFSET(x_pref, 128);
|
||||
PREF_OFFSET(x_pref, 160);
|
||||
PREF_OFFSET(x_pref, 192);
|
||||
PREF_OFFSET(x_pref, 224);
|
||||
x_pref += 32;
|
||||
|
||||
LD_DP16_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10,
|
||||
x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
d4, d5, d6, d7);
|
||||
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
|
||||
d8, d9, d10, d11);
|
||||
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
|
||||
d12, d13, d14, d15);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
|
||||
SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
|
||||
SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
|
||||
FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
|
||||
FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
|
||||
ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
|
||||
d12, d13, d14, d15, x, 2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
d4, d5, d6, d7);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
|
||||
ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, 2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, 2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
ST_DP4_INC(d0, d1, d2, d3, x, 2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, 2, x0, x1);
|
||||
MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
|
||||
SHF_W2_DP(d0, d1, d0, d1, SHF_78);
|
||||
FMADD2(x0, x1, da_r_vec, d0, d1);
|
||||
ST_DP2_INC(d0, d1, x, 2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
|
||||
tp0 = da_r * f0;
|
||||
tp0 -= da_i * f1;
|
||||
tp1 = da_r * f1;
|
||||
tp1 += da_i * f0;
|
||||
|
||||
ST_GP2_INC(tp0, tp1, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
if ((0.0 == da_r) && (0.0 == da_i))
|
||||
{
|
||||
v2f64 zero_v = __msa_cast_to_vector_double(0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
|
||||
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, inc_x2);
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
ST_DP8_INC(zero_v, zero_v, zero_v, zero_v, zero_v, zero_v,
|
||||
zero_v, zero_v, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
ST_DP4_INC(zero_v, zero_v, zero_v, zero_v, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
ST_DP2_INC(zero_v, zero_v, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
ST_DP(zero_v, x);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (0.0 == da_r)
|
||||
{
|
||||
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
|
||||
da_i_vec_neg = -da_i_vec;
|
||||
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
|
||||
x10, x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
x4, x5, x6, x7);
|
||||
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
|
||||
x8, x9, x10, x11);
|
||||
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
|
||||
x12, x13, x14, x15);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
|
||||
SHF_W4_DP(x8, x9, x10, x11, x8, x9, x10, x11, SHF_78);
|
||||
SHF_W4_DP(x12, x13, x14, x15, x12, x13, x14, x15, SHF_78);
|
||||
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
|
||||
x12, x13, x14, x15, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
x4, x5, x6, x7);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
SHF_W4_DP(x4, x5, x6, x7, x4, x5, x6, x7, SHF_78);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
x0, x1, x2, x3);
|
||||
SHF_W4_DP(x0, x1, x2, x3, x0, x1, x2, x3, SHF_78);
|
||||
ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, inc_x2, x0, x1);
|
||||
MUL2(x0, da_i_vec, x1, da_i_vec, x0, x1);
|
||||
SHF_W2_DP(x0, x1, x0, x1, SHF_78);
|
||||
ST_DP2_INC(x0, x1, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da_i, f1, -da_i, f0, f1);
|
||||
ST_GP2_INC(f1, f0, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else if (0.0 == da_i)
|
||||
{
|
||||
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
|
||||
x10, x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
|
||||
x4, x5, x6, x7);
|
||||
MUL4(x8, da_r_vec, x9, da_r_vec, x10, da_r_vec, x11, da_r_vec,
|
||||
x8, x9, x10, x11);
|
||||
MUL4(x12, da_r_vec, x13, da_r_vec, x14, da_r_vec, x15, da_r_vec,
|
||||
x12, x13, x14, x15);
|
||||
ST_DP16_INC(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11,
|
||||
x12, x13, x14, x15, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
MUL4(x4, da_r_vec, x5, da_r_vec, x6, da_r_vec, x7, da_r_vec,
|
||||
x4, x5, x6, x7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_r_vec, x1, da_r_vec, x2, da_r_vec, x3, da_r_vec,
|
||||
x0, x1, x2, x3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, inc_x2, x0, x1);
|
||||
MUL2(x0, da_r_vec, x1, da_r_vec, x0, x1);
|
||||
ST_DP2_INC(x0, x1, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
MUL2(f0, da_r, f1, da_r, f0, f1);
|
||||
ST_GP2_INC(f0, f1, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
da_i_vec = COPY_DOUBLE_TO_VECTOR(da_i);
|
||||
da_i_vec_neg = -da_i_vec;
|
||||
da_i_vec = (v2f64) __msa_ilvev_d((v2i64) da_i_vec_neg, (v2i64) da_i_vec);
|
||||
|
||||
da_r_vec = COPY_DOUBLE_TO_VECTOR(da_r);
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_DP16_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7, x8, x9,
|
||||
x10, x11, x12, x13, x14, x15);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
d4, d5, d6, d7);
|
||||
MUL4(x8, da_i_vec, x9, da_i_vec, x10, da_i_vec, x11, da_i_vec,
|
||||
d8, d9, d10, d11);
|
||||
MUL4(x12, da_i_vec, x13, da_i_vec, x14, da_i_vec, x15, da_i_vec,
|
||||
d12, d13, d14, d15);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
|
||||
SHF_W4_DP(d8, d9, d10, d11, d8, d9, d10, d11, SHF_78);
|
||||
SHF_W4_DP(d12, d13, d14, d15, d12, d13, d14, d15, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
|
||||
FMADD4(x8, x9, x10, x11, da_r_vec, d8, d9, d10, d11);
|
||||
FMADD4(x12, x13, x14, x15, da_r_vec, d12, d13, d14, d15);
|
||||
ST_DP16_INC(d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11,
|
||||
d12, d13, d14, d15, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if (n & 8)
|
||||
{
|
||||
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
MUL4(x4, da_i_vec, x5, da_i_vec, x6, da_i_vec, x7, da_i_vec,
|
||||
d4, d5, d6, d7);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
SHF_W4_DP(d4, d5, d6, d7, d4, d5, d6, d7, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
FMADD4(x4, x5, x6, x7, da_r_vec, d4, d5, d6, d7);
|
||||
ST_DP8_INC(d0, d1, d2, d3, d4, d5, d6, d7, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
|
||||
MUL4(x0, da_i_vec, x1, da_i_vec, x2, da_i_vec, x3, da_i_vec,
|
||||
d0, d1, d2, d3);
|
||||
SHF_W4_DP(d0, d1, d2, d3, d0, d1, d2, d3, SHF_78);
|
||||
FMADD4(x0, x1, x2, x3, da_r_vec, d0, d1, d2, d3);
|
||||
ST_DP4_INC(d0, d1, d2, d3, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, inc_x2, x0, x1);
|
||||
MUL2(x0, da_i_vec, x1, da_i_vec, d0, d1);
|
||||
SHF_W2_DP(d0, d1, d0, d1, SHF_78);
|
||||
FMADD2(x0, x1, da_r_vec, d0, d1);
|
||||
ST_DP2_INC(d0, d1, x, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(px, 1, f0, f1);
|
||||
|
||||
tp0 = da_r * f0;
|
||||
tp0 -= da_i * f1;
|
||||
tp1 = da_r * f1;
|
||||
tp1 += da_i * f0;
|
||||
|
||||
ST_GP2_INC(tp0, tp1, x, 1);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
|
@ -0,0 +1,238 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3,
|
||||
FLOAT dummy4, FLOAT *srcx, BLASLONG inc_x, FLOAT *srcy,
|
||||
BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
|
||||
{
|
||||
BLASLONG i, inc_x2, inc_y2, pref_offsetx, pref_offsety;
|
||||
FLOAT *px, *py;
|
||||
v2f64 x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
v2f64 y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
|
||||
if (n < 0) return (0);
|
||||
|
||||
pref_offsetx = (BLASLONG)srcx & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsetx > 0)
|
||||
{
|
||||
pref_offsetx = L1_DATA_LINESIZE - pref_offsetx;
|
||||
pref_offsetx = pref_offsetx / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
pref_offsety = (BLASLONG)srcy & (L1_DATA_LINESIZE - 1);
|
||||
if (pref_offsety > 0)
|
||||
{
|
||||
pref_offsety = L1_DATA_LINESIZE - pref_offsety;
|
||||
pref_offsety = pref_offsety / sizeof(FLOAT);
|
||||
}
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
px = srcx;
|
||||
py = srcy;
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
if (n >> 3)
|
||||
{
|
||||
LD_DP8_INC(px, 2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
|
||||
for (i = (n >> 3) - 1; i--;)
|
||||
{
|
||||
PREFETCH(px + pref_offsetx + 16);
|
||||
PREFETCH(px + pref_offsetx + 20);
|
||||
PREFETCH(px + pref_offsetx + 24);
|
||||
PREFETCH(px + pref_offsetx + 28);
|
||||
|
||||
PREFETCH(py + pref_offsety + 16);
|
||||
PREFETCH(py + pref_offsety + 20);
|
||||
PREFETCH(py + pref_offsety + 24);
|
||||
PREFETCH(py + pref_offsety + 28);
|
||||
|
||||
y0 = LD_DP(py); py += 2;
|
||||
ST_DP(x0, srcy); srcy += 2;
|
||||
y1 = LD_DP(py); py += 2;
|
||||
ST_DP(x1, srcy); srcy += 2;
|
||||
y2 = LD_DP(py); py += 2;
|
||||
ST_DP(x2, srcy); srcy += 2;
|
||||
y3 = LD_DP(py); py += 2;
|
||||
ST_DP(x3, srcy); srcy += 2;
|
||||
y4 = LD_DP(py); py += 2;
|
||||
ST_DP(x4, srcy); srcy += 2;
|
||||
y5 = LD_DP(py); py += 2;
|
||||
ST_DP(x5, srcy); srcy += 2;
|
||||
y6 = LD_DP(py); py += 2;
|
||||
ST_DP(x6, srcy); srcy += 2;
|
||||
y7 = LD_DP(py); py += 2;
|
||||
ST_DP(x7, srcy); srcy += 2;
|
||||
|
||||
x0 = LD_DP(px); px += 2;
|
||||
ST_DP(y0, srcx); srcx += 2;
|
||||
x1 = LD_DP(px); px += 2;
|
||||
ST_DP(y1, srcx); srcx += 2;
|
||||
x2 = LD_DP(px); px += 2;
|
||||
ST_DP(y2, srcx); srcx += 2;
|
||||
x3 = LD_DP(px); px += 2;
|
||||
ST_DP(y3, srcx); srcx += 2;
|
||||
x4 = LD_DP(px); px += 2;
|
||||
ST_DP(y4, srcx); srcx += 2;
|
||||
x5 = LD_DP(px); px += 2;
|
||||
ST_DP(y5, srcx); srcx += 2;
|
||||
x6 = LD_DP(px); px += 2;
|
||||
ST_DP(y6, srcx); srcx += 2;
|
||||
x7 = LD_DP(px); px += 2;
|
||||
ST_DP(y7, srcx); srcx += 2;
|
||||
}
|
||||
|
||||
LD_DP8_INC(py, 2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, 2);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, 2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2) && (n & 1))
|
||||
{
|
||||
LD_DP7_INC(px, 2, x0, x1, x2, x3, x4, x5, x6);
|
||||
LD_DP7_INC(py, 2, y0, y1, y2, y3, y4, y5, y6);
|
||||
ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, 2);
|
||||
ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, 2);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP6_INC(px, 2, x0, x1, x2, x3, x4, x5);
|
||||
LD_DP6_INC(py, 2, y0, y1, y2, y3, y4, y5);
|
||||
ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, 2);
|
||||
ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, 2);
|
||||
}
|
||||
else if ((n & 4) && (n & 1))
|
||||
{
|
||||
LD_DP5_INC(px, 2, x0, x1, x2, x3, x4);
|
||||
LD_DP5_INC(py, 2, y0, y1, y2, y3, y4);
|
||||
ST_DP5_INC(x0, x1, x2, x3, x4, srcy, 2);
|
||||
ST_DP5_INC(y0, y1, y2, y3, y4, srcx, 2);
|
||||
}
|
||||
else if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_DP3_INC(px, 2, x0, x1, x2);
|
||||
LD_DP3_INC(py, 2, y0, y1, y2);
|
||||
ST_DP3_INC(x0, x1, x2, srcy, 2);
|
||||
ST_DP3_INC(y0, y1, y2, srcx, 2);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, 2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, 2, y0, y1, y2, y3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, srcy, 2);
|
||||
ST_DP4_INC(y0, y1, y2, y3, srcx, 2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, 2, x0, x1);
|
||||
LD_DP2_INC(py, 2, y0, y1);
|
||||
ST_DP2_INC(x0, x1, srcy, 2);
|
||||
ST_DP2_INC(y0, y1, srcx, 2);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = LD_DP(px);
|
||||
y0 = LD_DP(py);
|
||||
ST_DP(y0, srcx);
|
||||
ST_DP(x0, srcy);
|
||||
}
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP8_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6, x7);
|
||||
LD_DP8_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6, y7);
|
||||
ST_DP8_INC(x0, x1, x2, x3, x4, x5, x6, x7, srcy, inc_y2);
|
||||
ST_DP8_INC(y0, y1, y2, y3, y4, y5, y6, y7, srcx, inc_x2);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2) && (n & 1))
|
||||
{
|
||||
LD_DP7_INC(px, inc_x2, x0, x1, x2, x3, x4, x5, x6);
|
||||
LD_DP7_INC(py, inc_y2, y0, y1, y2, y3, y4, y5, y6);
|
||||
ST_DP7_INC(x0, x1, x2, x3, x4, x5, x6, srcy, inc_y2);
|
||||
ST_DP7_INC(y0, y1, y2, y3, y4, y5, y6, srcx, inc_x2);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP6_INC(px, inc_x2, x0, x1, x2, x3, x4, x5);
|
||||
LD_DP6_INC(py, inc_y2, y0, y1, y2, y3, y4, y5);
|
||||
ST_DP6_INC(x0, x1, x2, x3, x4, x5, srcy, inc_y2);
|
||||
ST_DP6_INC(y0, y1, y2, y3, y4, y5, srcx, inc_x2);
|
||||
}
|
||||
else if ((n & 4) && (n & 1))
|
||||
{
|
||||
LD_DP5_INC(px, inc_x2, x0, x1, x2, x3, x4);
|
||||
LD_DP5_INC(py, inc_y2, y0, y1, y2, y3, y4);
|
||||
ST_DP5_INC(x0, x1, x2, x3, x4, srcy, inc_y2);
|
||||
ST_DP5_INC(y0, y1, y2, y3, y4, srcx, inc_x2);
|
||||
}
|
||||
else if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_DP3_INC(px, inc_x2, x0, x1, x2);
|
||||
LD_DP3_INC(py, inc_y2, y0, y1, y2);
|
||||
ST_DP3_INC(x0, x1, x2, srcy, inc_y2);
|
||||
ST_DP3_INC(y0, y1, y2, srcx, inc_x2);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(px, inc_x2, x0, x1, x2, x3);
|
||||
LD_DP4_INC(py, inc_y2, y0, y1, y2, y3);
|
||||
ST_DP4_INC(x0, x1, x2, x3, srcy, inc_y2);
|
||||
ST_DP4_INC(y0, y1, y2, y3, srcx, inc_x2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(px, inc_x2, x0, x1);
|
||||
LD_DP2_INC(py, inc_y2, y0, y1);
|
||||
ST_DP2_INC(x0, x1, srcy, inc_y2);
|
||||
ST_DP2_INC(y0, y1, srcx, inc_x2);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = LD_DP(px);
|
||||
y0 = LD_DP(py);
|
||||
ST_DP(y0, srcx);
|
||||
ST_DP(x0, srcy);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return (0);
|
||||
}
|
Loading…
Reference in New Issue