Merge pull request #927 from sva-img/develop
Added MSA optimization for GEMV_N, GEMV_T, ASUM, DOT functions
This commit is contained in:
commit
27b5211ccd
|
@ -529,7 +529,7 @@ CCOMMON_OPT += -mmsa
|
|||
FCOMMON_OPT += -mmsa
|
||||
endif
|
||||
|
||||
ifeq ($(CORE), I6400)
|
||||
ifneq ($(filter $(CORE), I6400 P6600),)
|
||||
CCOMMON_OPT += -mmsa
|
||||
FCOMMON_OPT += -mmsa
|
||||
endif
|
||||
|
|
|
@ -61,6 +61,7 @@ SICORTEX
|
|||
LOONGSON3A
|
||||
LOONGSON3B
|
||||
I6400
|
||||
P6600
|
||||
|
||||
5.IA64 CPU:
|
||||
ITANIUM2
|
||||
|
|
|
@ -75,13 +75,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define CPU_LOONGSON3A 2
|
||||
#define CPU_LOONGSON3B 3
|
||||
#define CPU_I6400 4
|
||||
#define CPU_P6600 5
|
||||
|
||||
static char *cpuname[] = {
|
||||
"UNKOWN",
|
||||
"SICORTEX",
|
||||
"LOONGSON3A",
|
||||
"LOONGSON3B",
|
||||
"I6400"
|
||||
"I6400",
|
||||
"P6600"
|
||||
};
|
||||
|
||||
int detect(void){
|
||||
|
@ -161,6 +163,8 @@ void get_subarchitecture(void){
|
|||
printf("LOONGSON3B");
|
||||
}else if(detect()==CPU_I6400){
|
||||
printf("I6400");
|
||||
}else if(detect()==CPU_P6600){
|
||||
printf("P6600");
|
||||
}else{
|
||||
printf("SICORTEX");
|
||||
}
|
||||
|
@ -198,6 +202,15 @@ void get_cpuconfig(void){
|
|||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}else if(detect()==CPU_P6600){
|
||||
printf("#define P6600\n");
|
||||
printf("#define L1_DATA_SIZE 65536\n");
|
||||
printf("#define L1_DATA_LINESIZE 32\n");
|
||||
printf("#define L2_SIZE 1048576\n");
|
||||
printf("#define L2_LINESIZE 32\n");
|
||||
printf("#define DTB_DEFAULT_ENTRIES 64\n");
|
||||
printf("#define DTB_SIZE 4096\n");
|
||||
printf("#define L2_ASSOCIATIVE 8\n");
|
||||
}else{
|
||||
printf("#define SICORTEX\n");
|
||||
printf("#define L1_DATA_SIZE 32768\n");
|
||||
|
@ -217,6 +230,8 @@ void get_libname(void){
|
|||
printf("loongson3b\n");
|
||||
}else if(detect()==CPU_I6400) {
|
||||
printf("i6400\n");
|
||||
}else if(detect()==CPU_P6600) {
|
||||
printf("p6600\n");
|
||||
}else{
|
||||
printf("mips64\n");
|
||||
}
|
||||
|
|
15
getarch.c
15
getarch.c
|
@ -132,6 +132,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
/* #define FORCE_LOONGSON3A */
|
||||
/* #define FORCE_LOONGSON3B */
|
||||
/* #define FORCE_I6400 */
|
||||
/* #define FORCE_P6600 */
|
||||
/* #define FORCE_P5600 */
|
||||
/* #define FORCE_ITANIUM2 */
|
||||
/* #define FORCE_SPARC */
|
||||
|
@ -715,6 +716,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_P6600
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
#define SUBARCHITECTURE "P6600"
|
||||
#define SUBDIRNAME "mips64"
|
||||
#define ARCHCONFIG "-DP6600 " \
|
||||
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
|
||||
"-DL2_SIZE=1048576 -DL2_LINESIZE=32 " \
|
||||
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
|
||||
#define LIBNAME "p6600"
|
||||
#define CORENAME "P6600"
|
||||
#else
|
||||
#endif
|
||||
|
||||
#ifdef FORCE_P5600
|
||||
#define FORCE
|
||||
#define ARCHITECTURE "MIPS"
|
||||
|
|
|
@ -30,10 +30,10 @@ IDMAXKERNEL = ../mips/imax.c
|
|||
ISMINKERNEL = ../mips/imin.c
|
||||
IDMINKERNEL = ../mips/imin.c
|
||||
|
||||
SASUMKERNEL = ../mips/asum.c
|
||||
DASUMKERNEL = ../mips/asum.c
|
||||
CASUMKERNEL = ../mips/zasum.c
|
||||
ZASUMKERNEL = ../mips/zasum.c
|
||||
SASUMKERNEL = ../mips/sasum_msa.c
|
||||
DASUMKERNEL = ../mips/dasum_msa.c
|
||||
CASUMKERNEL = ../mips/casum_msa.c
|
||||
ZASUMKERNEL = ../mips/zasum_msa.c
|
||||
|
||||
SAXPYKERNEL = ../mips/axpy.c
|
||||
DAXPYKERNEL = ../mips/axpy.c
|
||||
|
@ -45,10 +45,10 @@ DCOPYKERNEL = ../mips/copy.c
|
|||
CCOPYKERNEL = ../mips/zcopy.c
|
||||
ZCOPYKERNEL = ../mips/zcopy.c
|
||||
|
||||
SDOTKERNEL = ../mips/dot.c
|
||||
DDOTKERNEL = ../mips/dot.c
|
||||
CDOTKERNEL = ../mips/zdot.c
|
||||
ZDOTKERNEL = ../mips/zdot.c
|
||||
SDOTKERNEL = ../mips/sdot_msa.c
|
||||
DDOTKERNEL = ../mips/ddot_msa.c
|
||||
CDOTKERNEL = ../mips/cdot_msa.c
|
||||
ZDOTKERNEL = ../mips/zdot_msa.c
|
||||
|
||||
SNRM2KERNEL = ../mips/nrm2.c
|
||||
DNRM2KERNEL = ../mips/nrm2.c
|
||||
|
@ -70,15 +70,15 @@ DSWAPKERNEL = ../mips/swap.c
|
|||
CSWAPKERNEL = ../mips/zswap.c
|
||||
ZSWAPKERNEL = ../mips/zswap.c
|
||||
|
||||
SGEMVNKERNEL = ../mips/gemv_n.c
|
||||
DGEMVNKERNEL = ../mips/gemv_n.c
|
||||
CGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n.c
|
||||
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
|
||||
DGEMVNKERNEL = ../mips/dgemv_n_msa.c
|
||||
CGEMVNKERNEL = ../mips/cgemv_n_msa.c
|
||||
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
|
||||
|
||||
SGEMVTKERNEL = ../mips/gemv_t.c
|
||||
DGEMVTKERNEL = ../mips/gemv_t.c
|
||||
CGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t.c
|
||||
SGEMVTKERNEL = ../mips/sgemv_t_msa.c
|
||||
DGEMVTKERNEL = ../mips/dgemv_t_msa.c
|
||||
CGEMVTKERNEL = ../mips/cgemv_t_msa.c
|
||||
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c
|
||||
|
||||
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
|
||||
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
|
||||
|
|
|
@ -0,0 +1,338 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i, inc_x2;
|
||||
FLOAT sumf = 0.0;
|
||||
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
|
||||
v4f32 zero_v = {0};
|
||||
v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return (sumf);
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
if (n > 15)
|
||||
{
|
||||
n -= 16;
|
||||
|
||||
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 = AND_VEC_W(src0);
|
||||
sum_abs1 = AND_VEC_W(src1);
|
||||
sum_abs2 = AND_VEC_W(src2);
|
||||
sum_abs3 = AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = zero_v;
|
||||
sum_abs1 = zero_v;
|
||||
sum_abs2 = zero_v;
|
||||
sum_abs3 = zero_v;
|
||||
}
|
||||
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4) && (n & 2))
|
||||
{
|
||||
LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 8) && (n & 2))
|
||||
{
|
||||
LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_SP3_INC(x, 4, src0, src1, src2);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(x, 4, src0, src1, src2, src3);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(x, 4, src0, src1);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
src0 = LD_SP(x); x += 4;
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
sumf += fabsf(*(x + 0));
|
||||
sumf += fabsf(*(x + 1));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
|
||||
if (n > 8)
|
||||
{
|
||||
n -= 8;
|
||||
|
||||
LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 = AND_VEC_W(src0);
|
||||
sum_abs1 = AND_VEC_W(src1);
|
||||
sum_abs2 = AND_VEC_W(src2);
|
||||
sum_abs3 = AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = zero_v;
|
||||
sum_abs1 = zero_v;
|
||||
sum_abs2 = zero_v;
|
||||
sum_abs3 = zero_v;
|
||||
}
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_SP8_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2) && (n & 1))
|
||||
{
|
||||
LD_SP7_INC(x, inc_x2, src0, src1, src2, src3, src4, src5, src6);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_SP6_INC(x, inc_x2, src0, src1, src2, src3, src4, src5);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
}
|
||||
else if ((n & 4) && (n & 1))
|
||||
{
|
||||
LD_SP5_INC(x, inc_x2, src0, src1, src2, src3, src4);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
}
|
||||
else if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_SP3_INC(x, inc_x2, src0, src1, src2);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_SP4_INC(x, inc_x2, src0, src1, src2, src3);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_SP2_INC(x, inc_x2, src0, src1);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
src0 = LD_SP(x); x += inc_x2;
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
}
|
||||
}
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf = sum_abs0[0] + sum_abs0[1];
|
||||
}
|
||||
|
||||
return (sumf);
|
||||
}
|
|
@ -0,0 +1,361 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Operator selection for the scalar accumulation statements in this file,
   which have the form
       dot[0] += ( xr * yr OP3 xi * yi );
       dot[1] OP2 ( xi * yr OP4 xr * yi );
   Without CONJ this is the plain complex product; with CONJ the signs of the
   imaginary-part terms of x are flipped. */
#if !defined(CONJ)
|
||||
#define OP2 +=
|
||||
#define OP3 -
|
||||
#define OP4 +
|
||||
#else
|
||||
#define OP2 -=
|
||||
#define OP3 +
|
||||
#define OP4 -
|
||||
#endif
|
||||
|
||||
/* One accumulation step for the operand pair with index `idx`
   (vx<idx>r/vx<idx>i against vy<idx>r/vy<idx>i):
       dot0 += xr * yr;   dot0 OPR0= xi * yi;
       dot1 OPR1= xi * yr; dot1 += xr * yi;
   OPR0 and OPR1 are passed as a bare `+` or `-` and pasted onto `=`.
   dot0, dot1 and the vx / vy operands must exist at the expansion site. */
#define DOT_STEP(idx, OPR0, OPR1)                \
    dot0 += (vx##idx##r * vy##idx##r);           \
    dot0 OPR0##= (vx##idx##i * vy##idx##i);      \
    dot1 OPR1##= (vx##idx##i * vy##idx##r);      \
    dot1 += (vx##idx##r * vy##idx##i);

/* Four operand pairs (indices 0..3), processed in ascending order. */
#define DOT16_KERNEL(OPR0, OPR1)  \
    DOT_STEP(0, OPR0, OPR1)       \
                                  \
    DOT_STEP(1, OPR0, OPR1)       \
                                  \
    DOT_STEP(2, OPR0, OPR1)       \
                                  \
    DOT_STEP(3, OPR0, OPR1)

/* Three operand pairs (indices 0..2). */
#define DOT12_KERNEL(OPR0, OPR1)  \
    DOT_STEP(0, OPR0, OPR1)       \
                                  \
    DOT_STEP(1, OPR0, OPR1)       \
                                  \
    DOT_STEP(2, OPR0, OPR1)

/* Two operand pairs (indices 0..1). */
#define DOT8_KERNEL(OPR0, OPR1)   \
    DOT_STEP(0, OPR0, OPR1)       \
                                  \
    DOT_STEP(1, OPR0, OPR1)

/* A single operand pair (index 0). */
#define DOT4_KERNEL(OPR0, OPR1)   \
    DOT_STEP(0, OPR0, OPR1)
|
||||
|
||||
/* returns a complex float; x and y hold interleaved (re, im) float pairs */
|
||||
/* cdotc - CONJ */
|
||||
/* cdotu - !CONJ */
|
||||
#ifndef _MSC_VER
|
||||
#include <complex.h>
|
||||
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT dot[2];
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
FLOAT x0, x1, x2, x3, x4, x5, x6, x7;
|
||||
FLOAT y0, y1, y2, y3, y4, y5, y6, y7;
|
||||
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v4f32 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
|
||||
v4f32 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
|
||||
v4f32 dot0 = {0, 0, 0, 0};
|
||||
v4f32 dot1 = {0, 0, 0, 0};
|
||||
openblas_complex_float result;
|
||||
|
||||
dot[0] = 0.0;
|
||||
dot[1] = 0.0;
|
||||
|
||||
__real__(result) = 0.0;
|
||||
__imag__(result) = 0.0;
|
||||
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
|
||||
PCKEVOD_W2_SP(vx7, vx6, vx3r, vx3i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
|
||||
PCKEVOD_W2_SP(vy7, vy6, vy3r, vy3i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT16_KERNEL(-, +);
|
||||
#else
|
||||
DOT16_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
LD_SP2_INC(x, 4, vx4, vx5);
|
||||
LD_SP2_INC(y, 4, vy4, vy5);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_W2_SP(vx5, vx4, vx2r, vx2i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_W2_SP(vy5, vy4, vy2r, vy2i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT12_KERNEL(-, +);
|
||||
#else
|
||||
DOT12_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vx3, vx2, vx1r, vx1i);
|
||||
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_W2_SP(vy3, vy2, vy1r, vy1i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT8_KERNEL(-, +);
|
||||
#else
|
||||
DOT8_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_SP2_INC(x, 4, vx0, vx1);
|
||||
LD_SP2_INC(y, 4, vy0, vy1);
|
||||
PCKEVOD_W2_SP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_W2_SP(vy1, vy0, vy0r, vy0i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT4_KERNEL(-, +);
|
||||
#else
|
||||
DOT4_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP6_INC(x, 1, x0, x1, x2, x3, x4, x5);
|
||||
LD_GP6_INC(y, 1, y0, y1, y2, y3, y4, y5);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP4_INC(x, 1, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, 1, y0, y1, y2, y3);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
LD_GP2_INC(x, 1, x0, x1);
|
||||
LD_GP2_INC(y, 1, y0, y1);
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
}
|
||||
}
|
||||
|
||||
dot[0] += (dot0[0] + dot0[1] + dot0[2] + dot0[3]);
|
||||
dot[1] += (dot1[0] + dot1[1] + dot1[2] + dot1[3]);
|
||||
}
|
||||
else
|
||||
{
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x4 = *x;
|
||||
x5 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x6 = *x;
|
||||
x7 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y4 = *y;
|
||||
y5 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y6 = *y;
|
||||
y7 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
|
||||
dot[0] += ( x6 * y6 OP3 x7 * y7 );
|
||||
dot[1] OP2 ( x7 * y6 OP4 x6 * y7 );
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x4 = *x;
|
||||
x5 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y4 = *y;
|
||||
y5 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
|
||||
dot[0] += ( x4 * y4 OP3 x5 * y5 );
|
||||
dot[1] OP2 ( x5 * y4 OP4 x4 * y5 );
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
x2 = *x;
|
||||
x3 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
y2 = *y;
|
||||
y3 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
|
||||
dot[0] += ( x2 * y2 OP3 x3 * y3 );
|
||||
dot[1] OP2 ( x3 * y2 OP4 x2 * y3 );
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
x1 = *(x + 1);
|
||||
x += inc_x2;
|
||||
|
||||
y0 = *y;
|
||||
y1 = *(y + 1);
|
||||
y += inc_y2;
|
||||
|
||||
dot[0] += ( x0 * y0 OP3 x1 * y1 );
|
||||
dot[1] OP2 ( x1 * y0 OP4 x0 * y1 );
|
||||
}
|
||||
}
|
||||
|
||||
__real__(result) = dot[0];
|
||||
__imag__(result) = dot[1];
|
||||
|
||||
return(result);
|
||||
}
|
|
@ -0,0 +1,611 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Compound-assignment operators used by the CGEMV_N_* macros that follow,
   chosen from the CONJ / XCONJ build flags:
     OP0  applied to (imag * imag) products added into the real accumulator
     OP1  applied to (real * imag) products added into the imaginary accumulator
     OP2  applied to (imag * real) products added into the imaginary accumulator
   OP3 and OP4 depend on XCONJ only; their users are outside this part of the
   file. Any earlier definitions are discarded first. */
#undef OP0
|
||||
#undef OP1
|
||||
#undef OP2
|
||||
#undef OP3
|
||||
#undef OP4
|
||||
|
||||
#if !defined(XCONJ)
|
||||
#define OP3 -=
|
||||
#define OP4 +=
|
||||
#else
|
||||
#define OP3 +=
|
||||
#define OP4 -=
|
||||
#endif
|
||||
|
||||
#if !defined(CONJ)
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 -=
|
||||
#define OP1 +=
|
||||
#define OP2 +=
|
||||
#else
|
||||
#define OP0 +=
|
||||
#define OP1 +=
|
||||
#define OP2 -=
|
||||
#endif
|
||||
#else
|
||||
#if !defined(XCONJ)
|
||||
#define OP0 +=
|
||||
#define OP1 -=
|
||||
#define OP2 -=
|
||||
#else
|
||||
#define OP0 -=
|
||||
#define OP1 -=
|
||||
#define OP2 +=
|
||||
#endif
|
||||
#endif
|
||||
|
||||
/* Vector update step using four column pointers (pa0..pa3) at offset k.
   Loads four vectors from each column, splits each pair of vectors into
   src<N>r / src<N>i with PCKEVOD_W2_SP (presumably real / imaginary lanes -
   confirm in macros_msa.h), then accumulates into two result pairs:
   y0r/y0i from the first half of each column, y1r/y1i from the second half.
   tp0..tp3 (r/i) are the per-column multipliers. OP0/OP1/OP2 pick the sign of
   the cross terms. All operands must exist at the expansion site.
   NOTE: the last line ends in a continuation backslash, so the line after the
   macro has to stay blank. */
#define CGEMV_N_8x4() \
|
||||
LD_SP4(pa0 + k, 4, t0, t1, t2, t3); \
|
||||
LD_SP4(pa1 + k, 4, t4, t5, t6, t7); \
|
||||
LD_SP4(pa2 + k, 4, t8, t9, t10, t11); \
|
||||
LD_SP4(pa3 + k, 4, t12, t13, t14, t15); \
|
||||
\
|
||||
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
|
||||
PCKEVOD_W2_SP(t3, t2, src1r, src1i); \
|
||||
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
|
||||
PCKEVOD_W2_SP(t7, t6, src3r, src3i); \
|
||||
PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
|
||||
PCKEVOD_W2_SP(t11, t10, src5r, src5i); \
|
||||
PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
|
||||
PCKEVOD_W2_SP(t15, t14, src7r, src7i); \
|
||||
\
|
||||
y0r += tp0r * src0r; \
|
||||
y1r += tp0r * src1r; \
|
||||
y0r += tp1r * src2r; \
|
||||
y1r += tp1r * src3r; \
|
||||
y0r += tp2r * src4r; \
|
||||
y1r += tp2r * src5r; \
|
||||
y0r += tp3r * src6r; \
|
||||
y1r += tp3r * src7r; \
|
||||
\
|
||||
y0r OP0 tp0i * src0i; \
|
||||
y1r OP0 tp0i * src1i; \
|
||||
y0r OP0 tp1i * src2i; \
|
||||
y1r OP0 tp1i * src3i; \
|
||||
y0r OP0 tp2i * src4i; \
|
||||
y1r OP0 tp2i * src5i; \
|
||||
y0r OP0 tp3i * src6i; \
|
||||
y1r OP0 tp3i * src7i; \
|
||||
\
|
||||
y0i OP1 tp0r * src0i; \
|
||||
y1i OP1 tp0r * src1i; \
|
||||
y0i OP1 tp1r * src2i; \
|
||||
y1i OP1 tp1r * src3i; \
|
||||
y0i OP1 tp2r * src4i; \
|
||||
y1i OP1 tp2r * src5i; \
|
||||
y0i OP1 tp3r * src6i; \
|
||||
y1i OP1 tp3r * src7i; \
|
||||
\
|
||||
y0i OP2 tp0i * src0r; \
|
||||
y1i OP2 tp0i * src1r; \
|
||||
y0i OP2 tp1i * src2r; \
|
||||
y1i OP2 tp1i * src3r; \
|
||||
y0i OP2 tp2i * src4r; \
|
||||
y1i OP2 tp2i * src5r; \
|
||||
y0i OP2 tp3i * src6r; \
|
||||
y1i OP2 tp3i * src7r; \
|
||||
|
||||
/* Half-width variant of the four-column vector step: loads two vectors from
   each of pa0..pa3 at offset k, splits them into src0/2/4/6 (r/i) with
   PCKEVOD_W2_SP, and accumulates into the single result pair y0r / y0i using
   the per-column multipliers tp0..tp3 (r/i). OP0/OP1/OP2 pick the sign of the
   cross terms. All operands must exist at the expansion site.
   NOTE: the last line ends in a continuation backslash, so the line after the
   macro has to stay blank. */
#define CGEMV_N_4x4() \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t4, t5); \
|
||||
LD_SP2(pa2 + k, 4, t8, t9); \
|
||||
LD_SP2(pa3 + k, 4, t12, t13); \
|
||||
\
|
||||
PCKEVOD_W2_SP(t1, t0, src0r, src0i); \
|
||||
PCKEVOD_W2_SP(t5, t4, src2r, src2i); \
|
||||
PCKEVOD_W2_SP(t9, t8, src4r, src4i); \
|
||||
PCKEVOD_W2_SP(t13, t12, src6r, src6i); \
|
||||
\
|
||||
y0r += tp0r * src0r; \
|
||||
y0r += tp1r * src2r; \
|
||||
y0r += tp2r * src4r; \
|
||||
y0r += tp3r * src6r; \
|
||||
\
|
||||
y0r OP0 tp0i * src0i; \
|
||||
y0r OP0 tp1i * src2i; \
|
||||
y0r OP0 tp2i * src4i; \
|
||||
y0r OP0 tp3i * src6i; \
|
||||
\
|
||||
y0i OP1 tp0r * src0i; \
|
||||
y0i OP1 tp1r * src2i; \
|
||||
y0i OP1 tp2r * src4i; \
|
||||
y0i OP1 tp3r * src6i; \
|
||||
\
|
||||
y0i OP2 tp0i * src0r; \
|
||||
y0i OP2 tp1i * src2r; \
|
||||
y0i OP2 tp2i * src4r; \
|
||||
y0i OP2 tp3i * src6r; \
|
||||
|
||||
/* Scalar update of one complex entry of y from four columns.
   Reads y[0] / y[1] (the index expression 0 * inc_y2 is always 0) into
   res0 / res1, adds the contributions of pa0..pa3 at offset k (pa<N>[k] is
   used with the *_r multiplier in the real sum, pa<N>[k + 1] with the *_i
   multiplier), and writes both values back. temp0..temp3 (_r/_i) are the
   per-column multipliers; OP0/OP1/OP2 pick the sign of the cross terms.
   NOTE: the last line ends in a continuation backslash, so the line after the
   macro has to stay blank. */
#define CGEMV_N_1x4() \
|
||||
res0 = y[0 * inc_y2]; \
|
||||
res1 = y[0 * inc_y2 + 1]; \
|
||||
\
|
||||
res0 += temp0_r * pa0[k]; \
|
||||
res0 OP0 temp0_i * pa0[k + 1]; \
|
||||
res0 += temp1_r * pa1[k]; \
|
||||
res0 OP0 temp1_i * pa1[k + 1]; \
|
||||
res0 += temp2_r * pa2[k]; \
|
||||
res0 OP0 temp2_i * pa2[k + 1]; \
|
||||
res0 += temp3_r * pa3[k]; \
|
||||
res0 OP0 temp3_i * pa3[k + 1]; \
|
||||
\
|
||||
res1 OP1 temp0_r * pa0[k + 1]; \
|
||||
res1 OP2 temp0_i * pa0[k]; \
|
||||
res1 OP1 temp1_r * pa1[k + 1]; \
|
||||
res1 OP2 temp1_i * pa1[k]; \
|
||||
res1 OP1 temp2_r * pa2[k + 1]; \
|
||||
res1 OP2 temp2_i * pa2[k]; \
|
||||
res1 OP1 temp3_r * pa3[k + 1]; \
|
||||
res1 OP2 temp3_i * pa3[k]; \
|
||||
\
|
||||
y[0 * inc_y2] = res0; \
|
||||
y[0 * inc_y2 + 1] = res1; \
|
||||
|
||||
/* 8-row x 2-column update step.
   Loads four vectors from each of pa0 and pa1 at offset k, splits them
   into src0..src3 r/i halves with PCKEVOD_W2_SP, and accumulates both
   columns, weighted by (tp0r,tp0i) and (tp1r,tp1i), into the two
   accumulator pairs (y0r,y0i) and (y1r,y1i).
   OP0/OP1/OP2 select the signs of the cross terms. */
#define CGEMV_N_8x2()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    \
    y0r += tp0r * src0r;  \
    y1r += tp0r * src1r;  \
    y0r += tp1r * src2r;  \
    y1r += tp1r * src3r;  \
    \
    y0r OP0 tp0i * src0i;  \
    y1r OP0 tp0i * src1i;  \
    y0r OP0 tp1i * src2i;  \
    y1r OP0 tp1i * src3i;  \
    \
    y0i OP1 tp0r * src0i;  \
    y1i OP1 tp0r * src1i;  \
    y0i OP1 tp1r * src2i;  \
    y1i OP1 tp1r * src3i;  \
    \
    y0i OP2 tp0i * src0r;  \
    y1i OP2 tp0i * src1r;  \
    y0i OP2 tp1i * src2r;  \
    y1i OP2 tp1i * src3r;
|
||||
|
||||
/* 4-row x 2-column update step: same scheme as CGEMV_N_8x2 but with two
   vectors per column and a single accumulator pair (y0r, y0i). */
#define CGEMV_N_4x2()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    \
    y0r += tp0r * src0r;  \
    y0r += tp1r * src2r;  \
    \
    y0r OP0 tp0i * src0i;  \
    y0r OP0 tp1i * src2i;  \
    \
    y0i OP1 tp0r * src0i;  \
    y0i OP1 tp1r * src2i;  \
    \
    y0i OP2 tp0i * src0r;  \
    y0i OP2 tp1i * src2r;
|
||||
|
||||
/* Scalar 1-row x 2-column update: read one complex y element, add the
   contributions of columns pa0 and pa1 at offset k weighted by
   (temp0_r,temp0_i) and (temp1_r,temp1_i), and store it back. */
#define CGEMV_N_1x2()  \
    res0 = y[0 * inc_y2];  \
    res1 = y[0 * inc_y2 + 1];  \
    \
    /* real part */  \
    res0 += temp0_r * pa0[k];  \
    res0 OP0 temp0_i * pa0[k + 1];  \
    res0 += temp1_r * pa1[k];  \
    res0 OP0 temp1_i * pa1[k + 1];  \
    \
    /* imaginary part */  \
    res1 OP1 temp0_r * pa0[k + 1];  \
    res1 OP2 temp0_i * pa0[k];  \
    res1 OP1 temp1_r * pa1[k + 1];  \
    res1 OP2 temp1_i * pa1[k];  \
    \
    y[0 * inc_y2] = res0;  \
    y[0 * inc_y2 + 1] = res1;
|
||||
|
||||
/* Scalar 1-row x 1-column update: y element += (temp_r, temp_i) times the
   complex entry at pa0[k], pa0[k + 1], with signs chosen by OP0/OP1/OP2. */
#define CGEMV_N_1x1()  \
    res0 = y[0 * inc_y2];  \
    res1 = y[0 * inc_y2 + 1];  \
    \
    res0 += temp_r * pa0[k];  \
    res0 OP0 temp_i * pa0[k + 1];  \
    \
    res1 OP1 temp_r * pa0[k + 1];  \
    res1 OP2 temp_i * pa0[k];  \
    \
    y[0 * inc_y2] = res0;  \
    y[0 * inc_y2 + 1] = res1;
|
||||
|
||||
/* Load four x entries with vector loads and scale them by alpha.
   The products are formed in tp4r / tp4i (OP3 / OP4 pick the sign of the
   alphai terms) and then handed to SPLATI_W4_SP, which fills tp0..tp3 r/i
   (presumably one broadcast lane per output -- confirm in macros_msa.h).
   Selected by CNAME when inc_x2 == 2. */
#define CLOAD_X4_SCALE_VECTOR()  \
    LD_SP2(x, 4, x0, x1);  \
    \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
    \
    tp4r = alphar * x0r;  \
    tp4r OP3 alphai * x0i;  \
    tp4i = alphar * x0i;  \
    tp4i OP4 alphai * x0r;  \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
|
||||
|
||||
/* Load four x entries one lane at a time (stride inc_x2) and scale them by
   alpha.  Used by CNAME when x is not unit-stride.

   Each float is moved as a raw 32-bit word into lanes 0..3 of x0r (values
   at x[n * inc_x2]) and x0i (values at x[n * inc_x2 + 1]).  The vector the
   first insert starts from only supplies lanes that the next three inserts
   overwrite, so its contents never reach the result.  alphar is used as
   that starting vector because CNAME initializes it before any expansion
   of this macro; the previous seed, tp0r, is still uninitialized the first
   time the column loop runs.

   The products are formed in tp4r / tp4i (OP3 / OP4 pick the sign of the
   alphai terms) and passed to SPLATI_W4_SP to fill tp0..tp3 r/i. */
#define CLOAD_X4_SCALE_GP()  \
    x0r = (v4f32) __msa_insert_w((v4i32) alphar, 0, *((int *) (x + 0 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) alphar, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1)));  \
    \
    tp4r = alphar * x0r;  \
    tp4r OP3 alphai * x0i;  \
    tp4i = alphar * x0i;  \
    tp4i OP4 alphai * x0r;  \
    \
    SPLATI_W4_SP(tp4r, tp0r, tp1r, tp2r, tp3r);  \
    SPLATI_W4_SP(tp4i, tp0i, tp1i, tp2i, tp3i);
|
||||
|
||||
/* Scale two x entries by (alpha_r, alpha_i) with scalar arithmetic.
   The scalar results stay in temp0_* / temp1_* for the 1x2 tail macro and
   are also copied into tp0r/tp0i/tp1r/tp1i via COPY_FLOAT_TO_VECTOR for
   the vector 8x2 / 4x2 steps.  OP3 / OP4 pick the sign of the alpha_i
   terms. */
#define CLOAD_X2_SCALE_GP()  \
    temp0_r = alpha_r * x[0 * inc_x2];  \
    temp0_r OP3 alpha_i * x[0 * inc_x2 + 1];  \
    temp0_i = alpha_r * x[0 * inc_x2 + 1];  \
    temp0_i OP4 alpha_i * x[0 * inc_x2];  \
    \
    temp1_r = alpha_r * x[1 * inc_x2];  \
    temp1_r OP3 alpha_i * x[1 * inc_x2 + 1];  \
    temp1_i = alpha_r * x[1 * inc_x2 + 1];  \
    temp1_i OP4 alpha_i * x[1 * inc_x2];  \
    \
    tp0r = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_r);  \
    tp0i = (v4f32) COPY_FLOAT_TO_VECTOR(temp0_i);  \
    tp1r = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_r);  \
    tp1i = (v4f32) COPY_FLOAT_TO_VECTOR(temp1_i);
|
||||
|
||||
/* Scale a single x entry by (alpha_r, alpha_i) into (temp_r, temp_i);
   OP3 / OP4 pick the sign of the alpha_i terms. */
#define CLOAD_X1_SCALE_GP()  \
    temp_r = alpha_r * x[0 * inc_x2];  \
    temp_r OP3 alpha_i * x[0 * inc_x2 + 1];  \
    temp_i = alpha_r * x[0 * inc_x2 + 1];  \
    temp_i OP4 alpha_i * x[0 * inc_x2];
|
||||
|
||||
/* Load eight y entries with vector loads and split them into the
   accumulator pairs (y0r, y0i) and (y1r, y1i).  Selected by CNAME when
   inc_y2 == 2. */
#define CLOAD_Y8_VECTOR()  \
    LD_SP4(y, 4, y0, y1, y2, y3);  \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);  \
    PCKEVOD_W2_SP(y3, y2, y1r, y1i);
|
||||
|
||||
/* Load four y entries with vector loads and split them into (y0r, y0i).
   Selected by CNAME when inc_y2 == 2. */
#define CLOAD_Y4_VECTOR()  \
    LD_SP2(y, 4, y0, y1);  \
    PCKEVOD_W2_SP(y1, y0, y0r, y0i);
|
||||
|
||||
/* Counterpart of CLOAD_Y8_VECTOR: recombine (y0r, y0i) and (y1r, y1i)
   with ILVRL_W2_SP and store the four resulting vectors back to y. */
#define CSTORE_Y8_VECTOR()  \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ILVRL_W2_SP(y1i, y1r, y2, y3);  \
    ST_SP4(y0, y1, y2, y3, y, 4);
|
||||
|
||||
/* Counterpart of CLOAD_Y4_VECTOR: recombine (y0r, y0i) with ILVRL_W2_SP
   and store the two resulting vectors back to y. */
#define CSTORE_Y4_VECTOR()  \
    ILVRL_W2_SP(y0i, y0r, y0, y1);  \
    ST_SP2(y0, y1, y, 4);
|
||||
|
||||
/* Gather eight strided y entries (stride inc_y2) into the accumulator
   pairs (y0r, y0i) and (y1r, y1i), one 32-bit lane at a time.
   tp0r is only the starting vector for the first insert of each register;
   all four lanes are overwritten, so its contents do not reach the
   result. */
#define CLOAD_Y8_GP()  \
    /* values at y[n * inc_y2], entries 0..3 */  \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2)));  \
    /* values at y[n * inc_y2], entries 4..7 */  \
    y1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 1, *((int *)(y + 5 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 2, *((int *)(y + 6 * inc_y2)));  \
    y1r = (v4f32) __msa_insert_w((v4i32) y1r, 3, *((int *)(y + 7 * inc_y2)));  \
    /* values at y[n * inc_y2 + 1], entries 0..3 */  \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1)));  \
    /* values at y[n * inc_y2 + 1], entries 4..7 */  \
    y1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 4 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 1, *((int *)(y + 5 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 2, *((int *)(y + 6 * inc_y2 + 1)));  \
    y1i = (v4f32) __msa_insert_w((v4i32) y1i, 3, *((int *)(y + 7 * inc_y2 + 1)));
|
||||
|
||||
/* Gather four strided y entries (stride inc_y2) into (y0r, y0i), one
   32-bit lane at a time.  tp0r is only the starting vector for the first
   insert; every lane is overwritten. */
#define CLOAD_Y4_GP()  \
    y0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 1, *((int *)(y + 1 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 2, *((int *)(y + 2 * inc_y2)));  \
    y0r = (v4f32) __msa_insert_w((v4i32) y0r, 3, *((int *)(y + 3 * inc_y2)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 1, *((int *)(y + 1 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 2, *((int *)(y + 2 * inc_y2 + 1)));  \
    y0i = (v4f32) __msa_insert_w((v4i32) y0i, 3, *((int *)(y + 3 * inc_y2 + 1)));
|
||||
|
||||
/* Counterpart of CLOAD_Y8_GP: scatter the lanes of (y0r, y1r) to
   y[n * inc_y2] and the lanes of (y0i, y1i) to y[n * inc_y2 + 1] for
   n = 0..7, copying each lane out as a raw 32-bit word. */
#define CSTORE_Y8_GP()  \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);  \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);  \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);  \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);  \
    *((int *)(y + 4 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 0);  \
    *((int *)(y + 5 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 1);  \
    *((int *)(y + 6 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 2);  \
    *((int *)(y + 7 * inc_y2)) = __msa_copy_s_w((v4i32) y1r, 3);  \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);  \
    *((int *)(y + 4 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 0);  \
    *((int *)(y + 5 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 1);  \
    *((int *)(y + 6 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 2);  \
    *((int *)(y + 7 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y1i, 3);
|
||||
|
||||
/* Counterpart of CLOAD_Y4_GP: scatter the lanes of y0r to y[n * inc_y2]
   and the lanes of y0i to y[n * inc_y2 + 1] for n = 0..3. */
#define CSTORE_Y4_GP()  \
    *((int *)(y + 0 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 0);  \
    *((int *)(y + 1 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 1);  \
    *((int *)(y + 2 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 2);  \
    *((int *)(y + 3 * inc_y2)) = __msa_copy_s_w((v4i32) y0r, 3);  \
    *((int *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 0);  \
    *((int *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 1);  \
    *((int *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 2);  \
    *((int *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_w((v4i32) y0i, 3);
|
||||
|
||||
/* Main loop nest of the non-transposed kernel, expanded by CNAME once per
   stride combination.

   Before each expansion the caller must #define CLOAD_X4_SCALE,
   CLOAD_X2_SCALE, CLOAD_X1_SCALE, CLOAD_Y8, CLOAD_Y4, CSTORE_Y8 and
   CSTORE_Y4 to the _VECTOR or _GP variant that matches the strides.

   Columns are consumed four at a time, then an optional pair, then an
   optional single column.  For every column group the scaled x entries
   are prepared first, y is rewound to y_org, and the m rows are walked in
   steps of 8, then 4, then 1.  k is the FLOAT offset into the current
   columns (two FLOATs per row).

   Every helper invocation is terminated with ';' so the loop does not
   depend on whether a helper's expansion happens to end in one. */
#define CGEMV_N_MSA()  \
    /* ---- groups of four columns ---- */  \
    for (j = (n >> 2); j--;)  \
    {  \
        CLOAD_X4_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_Y8();  \
            CGEMV_N_8x4();  \
            CSTORE_Y8();  \
            \
            k += 2 * 8;  \
            y += inc_y2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_Y4();  \
            CGEMV_N_4x4();  \
            CSTORE_Y4();  \
            \
            k += 2 * 4;  \
            y += inc_y2 * 4;  \
        }  \
        \
        if (m & 3)  \
        {  \
            /* scalar tail: pull the four scaled x values out of the */  \
            /* vectors produced by CLOAD_X4_SCALE                    */  \
            temp0_r = tp4r[0];  \
            temp1_r = tp4r[1];  \
            temp2_r = tp4r[2];  \
            temp3_r = tp4r[3];  \
            \
            temp0_i = tp4i[0];  \
            temp1_i = tp4i[1];  \
            temp2_i = tp4i[2];  \
            temp3_i = tp4i[3];  \
            \
            for (i = (m & 3); i--;)  \
            {  \
                CGEMV_N_1x4();  \
                \
                k += 2;  \
                y += inc_y2;  \
            }  \
        }  \
        \
        pa0 += 4 * lda2;  \
        pa1 += 4 * lda2;  \
        pa2 += 4 * lda2;  \
        pa3 += 4 * lda2;  \
        \
        x += 4 * inc_x2;  \
    }  \
    \
    /* ---- optional pair of columns ---- */  \
    if (n & 2)  \
    {  \
        CLOAD_X2_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_Y8();  \
            CGEMV_N_8x2();  \
            CSTORE_Y8();  \
            \
            k += 2 * 8;  \
            y += inc_y2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_Y4();  \
            CGEMV_N_4x2();  \
            CSTORE_Y4();  \
            \
            k += 2 * 4;  \
            y += inc_y2 * 4;  \
        }  \
        \
        for (i = (m & 3); i--;)  \
        {  \
            CGEMV_N_1x2();  \
            \
            k += 2;  \
            y += inc_y2;  \
        }  \
        \
        pa0 += 2 * lda2;  \
        pa1 += 2 * lda2;  \
        \
        x += 2 * inc_x2;  \
    }  \
    \
    /* ---- optional last column, scalar only ---- */  \
    if (n & 1)  \
    {  \
        CLOAD_X1_SCALE();  \
        \
        k = 0;  \
        y = y_org;  \
        \
        for (i = m; i--;)  \
        {  \
            CGEMV_N_1x1();  \
            \
            k += 2;  \
            y += inc_y2;  \
        }  \
        \
        pa0 += lda2;  \
        x += inc_x2;  \
    }
|
||||
|
||||
/* Non-transposed complex single-precision GEMV kernel for MSA.
 *
 * Accumulates into y the product of the m x n matrix A with x scaled by
 * (alpha_r, alpha_i); conjugation variants are selected at compile time
 * through the OP0..OP4 macros defined earlier in this file.
 *
 *   m, n        row and column counts
 *   dummy1      not referenced
 *   alpha_r/i   real and imaginary part of the scale factor
 *   A, lda2     matrix and its column stride
 *   x, inc_x2   input vector and its stride
 *   y, inc_y2   vector updated in place and its stride
 *   buffer      not referenced
 *
 * The three strides are doubled on entry so that they count FLOATs (the
 * helper macros address a value at index i and its partner at i + 1).
 * Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp_r, temp_i, res0, res1, temp0_r;
    FLOAT temp0_i, temp1_r, temp1_i, temp2_r, temp2_i, temp3_r, temp3_i;
    v4f32 alphar, alphai;
    v4f32 x0, x1, y0, y1, y2, y3, x0r, x0i, y0r, y1r, y0i, y1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp4r, tp0i, tp1i, tp2i, tp3i, tp4i;

    /* from here on all strides are in FLOAT units */
    lda2 *= 2;
    inc_x2 *= 2;
    inc_y2 *= 2;

    /* four consecutive columns of A */
    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    alphar = COPY_FLOAT_TO_VECTOR(alpha_r);
    alphai = COPY_FLOAT_TO_VECTOR(alpha_i);

    /* The two- and one-column x loaders are the same for every stride
       combination, so they are bound once for all four expansions. */
    #define CLOAD_X2_SCALE  CLOAD_X2_SCALE_GP
    #define CLOAD_X1_SCALE  CLOAD_X1_SCALE_GP

    if (2 == inc_x2)
    {
        /* unit-stride x: vector load for the four-column x loader */
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_VECTOR

        if (2 == inc_y2)
        {
            /* unit-stride y: vector loads and stores */
            #define CLOAD_Y8   CLOAD_Y8_VECTOR
            #define CLOAD_Y4   CLOAD_Y4_VECTOR
            #define CSTORE_Y8  CSTORE_Y8_VECTOR
            #define CSTORE_Y4  CSTORE_Y4_VECTOR

            CGEMV_N_MSA();

            #undef CLOAD_Y8
            #undef CLOAD_Y4
            #undef CSTORE_Y8
            #undef CSTORE_Y4
        }
        else
        {
            /* strided y: lane-by-lane gather and scatter */
            #define CLOAD_Y8   CLOAD_Y8_GP
            #define CLOAD_Y4   CLOAD_Y4_GP
            #define CSTORE_Y8  CSTORE_Y8_GP
            #define CSTORE_Y4  CSTORE_Y4_GP

            CGEMV_N_MSA();

            #undef CLOAD_Y8
            #undef CLOAD_Y4
            #undef CSTORE_Y8
            #undef CSTORE_Y4
        }

        #undef CLOAD_X4_SCALE
    }
    else
    {
        /* strided x: lane-by-lane gather for the four-column x loader */
        #define CLOAD_X4_SCALE  CLOAD_X4_SCALE_GP

        if (2 == inc_y2)
        {
            #define CLOAD_Y8   CLOAD_Y8_VECTOR
            #define CLOAD_Y4   CLOAD_Y4_VECTOR
            #define CSTORE_Y8  CSTORE_Y8_VECTOR
            #define CSTORE_Y4  CSTORE_Y4_VECTOR

            CGEMV_N_MSA();

            #undef CLOAD_Y8
            #undef CLOAD_Y4
            #undef CSTORE_Y8
            #undef CSTORE_Y4
        }
        else
        {
            #define CLOAD_Y8   CLOAD_Y8_GP
            #define CLOAD_Y4   CLOAD_Y4_GP
            #define CSTORE_Y8  CSTORE_Y8_GP
            #define CSTORE_Y4  CSTORE_Y4_GP

            CGEMV_N_MSA();

            #undef CLOAD_Y8
            #undef CLOAD_Y4
            #undef CSTORE_Y8
            #undef CSTORE_Y4
        }

        #undef CLOAD_X4_SCALE
    }

    #undef CLOAD_X2_SCALE
    #undef CLOAD_X1_SCALE

    return(0);
}
|
||||
|
||||
/* Drop the sign-selection macros so they do not leak past this kernel. */
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4
|
|
@ -0,0 +1,583 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Sign selection for the complex multiply-accumulate steps below.
   OP0 is applied to the (imaginary x imaginary) term of the real part,
   OP1 and OP2 to the two cross terms of the imaginary part.
   One set is used when CONJ and XCONJ are both defined or both undefined,
   the other when exactly one of them is defined. */
#undef OP0
#undef OP1
#undef OP2

#if defined(CONJ) == defined(XCONJ)
    #define OP0  -=
    #define OP1  +=
    #define OP2  +=
#else
    #define OP0  +=
    #define OP1  +=
    #define OP2  -=
#endif
|
||||
|
||||
/* 8-row x 4-column partial dot-product step.
   Loads four vectors from each of pa0..pa3 at offset k, splits them into
   src0..src7 r/i halves with PCKEVOD_W2_SP, and accumulates the products
   with the x vectors (x0r,x0i) and (x1r,x1i) into one accumulator pair per
   column: (tp0r,tp0i) for pa0 up to (tp3r,tp3i) for pa3.
   OP0/OP1/OP2 select the signs of the cross terms. */
#define CGEMV_T_8x4()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    LD_SP4(pa2 + k, 4, t8, t9, t10, t11);  \
    LD_SP4(pa3 + k, 4, t12, t13, t14, t15);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    PCKEVOD_W2_SP(t11, t10, src5r, src5i);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    PCKEVOD_W2_SP(t15, t14, src7r, src7i);  \
    \
    /* real accumulators, columns 0..3 */  \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    \
    tp1r += src2r * x0r;  \
    tp1r += src3r * x1r;  \
    tp1r OP0 src2i * x0i;  \
    tp1r OP0 src3i * x1i;  \
    \
    tp2r += src4r * x0r;  \
    tp2r += src5r * x1r;  \
    tp2r OP0 src4i * x0i;  \
    tp2r OP0 src5i * x1i;  \
    \
    tp3r += src6r * x0r;  \
    tp3r += src7r * x1r;  \
    tp3r OP0 src6i * x0i;  \
    tp3r OP0 src7i * x1i;  \
    \
    /* imaginary accumulators, columns 0..3 */  \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;  \
    \
    tp1i OP1 src2r * x0i;  \
    tp1i OP1 src3r * x1i;  \
    tp1i OP2 src2i * x0r;  \
    tp1i OP2 src3i * x1r;  \
    \
    tp2i OP1 src4r * x0i;  \
    tp2i OP1 src5r * x1i;  \
    tp2i OP2 src4i * x0r;  \
    tp2i OP2 src5i * x1r;  \
    \
    tp3i OP1 src6r * x0i;  \
    tp3i OP1 src7r * x1i;  \
    tp3i OP2 src6i * x0r;  \
    tp3i OP2 src7i * x1r;
|
||||
|
||||
/* 8-row x 2-column partial dot-product step: same scheme as CGEMV_T_8x4
   restricted to columns pa0 and pa1, accumulating into (tp0r,tp0i) and
   (tp1r,tp1i). */
#define CGEMV_T_8x2()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    LD_SP4(pa1 + k, 4, t4, t5, t6, t7);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t7, t6, src3r, src3i);  \
    \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    \
    tp1r += src2r * x0r;  \
    tp1r += src3r * x1r;  \
    tp1r OP0 src2i * x0i;  \
    tp1r OP0 src3i * x1i;  \
    \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;  \
    \
    tp1i OP1 src2r * x0i;  \
    tp1i OP1 src3r * x1i;  \
    tp1i OP2 src2i * x0r;  \
    tp1i OP2 src3i * x1r;
|
||||
|
||||
/* 8-row x 1-column partial dot-product step: column pa0 against
   (x0r,x0i) and (x1r,x1i), accumulating into (tp0r, tp0i). */
#define CGEMV_T_8x1()  \
    LD_SP4(pa0 + k, 4, t0, t1, t2, t3);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t3, t2, src1r, src1i);  \
    \
    tp0r += src0r * x0r;  \
    tp0r += src1r * x1r;  \
    tp0r OP0 src0i * x0i;  \
    tp0r OP0 src1i * x1i;  \
    \
    tp0i OP1 src0r * x0i;  \
    tp0i OP1 src1r * x1i;  \
    tp0i OP2 src0i * x0r;  \
    tp0i OP2 src1i * x1r;
|
||||
|
||||
/* 4-row x 4-column partial dot-product step.
   Loads two vectors from each of pa0..pa3 at offset k, splits them into
   src0/src2/src4/src6 r/i halves, and accumulates the products with
   (x0r, x0i) into (tp0r,tp0i)..(tp3r,tp3i), one pair per column. */
#define CGEMV_T_4x4()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    LD_SP2(pa2 + k, 4, t8, t9);  \
    LD_SP2(pa3 + k, 4, t12, t13);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    PCKEVOD_W2_SP(t9, t8, src4r, src4i);  \
    PCKEVOD_W2_SP(t13, t12, src6r, src6i);  \
    \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    \
    tp1r += src2r * x0r;  \
    tp1r OP0 src2i * x0i;  \
    \
    tp2r += src4r * x0r;  \
    tp2r OP0 src4i * x0i;  \
    \
    tp3r += src6r * x0r;  \
    tp3r OP0 src6i * x0i;  \
    \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;  \
    \
    tp1i OP1 src2r * x0i;  \
    tp1i OP2 src2i * x0r;  \
    \
    tp2i OP1 src4r * x0i;  \
    tp2i OP2 src4i * x0r;  \
    \
    tp3i OP1 src6r * x0i;  \
    tp3i OP2 src6i * x0r;
|
||||
|
||||
/* 4-row x 2-column partial dot-product step: columns pa0 and pa1 against
   (x0r, x0i), accumulating into (tp0r,tp0i) and (tp1r,tp1i). */
#define CGEMV_T_4x2()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    LD_SP2(pa1 + k, 4, t4, t5);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    PCKEVOD_W2_SP(t5, t4, src2r, src2i);  \
    \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    \
    tp1r += src2r * x0r;  \
    tp1r OP0 src2i * x0i;  \
    \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;  \
    \
    tp1i OP1 src2r * x0i;  \
    tp1i OP2 src2i * x0r;
|
||||
|
||||
/* 4-row x 1-column partial dot-product step: column pa0 against
   (x0r, x0i), accumulating into (tp0r, tp0i). */
#define CGEMV_T_4x1()  \
    LD_SP2(pa0 + k, 4, t0, t1);  \
    \
    PCKEVOD_W2_SP(t1, t0, src0r, src0i);  \
    \
    tp0r += src0r * x0r;  \
    tp0r OP0 src0i * x0i;  \
    \
    tp0i OP1 src0r * x0i;  \
    tp0i OP2 src0i * x0r;
|
||||
|
||||
/* Scalar 1-row x 4-column step: multiply the complex entry at offset k of
   each column pa0..pa3 by the current x element (x[0], x[1]) and add the
   result to the running sums (temp0r,temp0i)..(temp3r,temp3i). */
#define CGEMV_T_1x4()  \
    /* real parts */  \
    temp0r += pa0[k + 0] * x[0 * inc_x2];  \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
    temp1r += pa1[k + 0] * x[0 * inc_x2];  \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
    temp2r += pa2[k + 0] * x[0 * inc_x2];  \
    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1];  \
    temp3r += pa3[k + 0] * x[0 * inc_x2];  \
    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1];  \
    \
    /* imaginary parts */  \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];  \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];  \
    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1];  \
    temp2i OP2 pa2[k + 1] * x[0 * inc_x2];  \
    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1];  \
    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];
|
||||
|
||||
/* Scalar 1-row x 2-column step: columns pa0 and pa1 times the current x
   element, added to (temp0r,temp0i) and (temp1r,temp1i). */
#define CGEMV_T_1x2()  \
    temp0r += pa0[k + 0] * x[0 * inc_x2];  \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
    temp1r += pa1[k + 0] * x[0 * inc_x2];  \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];  \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];  \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];  \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];
|
||||
|
||||
/* Scalar 1-row x 1-column step: pa0 entry at offset k times the current x
   element, added to (temp0r, temp0i). */
#define CGEMV_T_1x1()  \
    temp0r += pa0[k + 0] * x[0 * inc_x2];  \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];  \
    \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];  \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];
|
||||
|
||||
/* Scale four finished column sums (temp0..temp3 r/i) by (alphar, alphai)
   and add them to four y entries spaced inc_y2 apart.  Scalar loads and
   stores are used, so this works for any y stride.  OP0/OP1/OP2 select the
   signs of the alphai terms. */
#define CSCALE_STORE_Y4_GP()  \
    res0r = y[0 * inc_y2];  \
    res1r = y[1 * inc_y2];  \
    res2r = y[2 * inc_y2];  \
    res3r = y[3 * inc_y2];  \
    \
    res0i = y[0 * inc_y2 + 1];  \
    res1i = y[1 * inc_y2 + 1];  \
    res2i = y[2 * inc_y2 + 1];  \
    res3i = y[3 * inc_y2 + 1];  \
    \
    /* real parts */  \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    res1r += alphar * temp1r;  \
    res1r OP0 alphai * temp1i;  \
    res2r += alphar * temp2r;  \
    res2r OP0 alphai * temp2i;  \
    res3r += alphar * temp3r;  \
    res3r OP0 alphai * temp3i;  \
    \
    /* imaginary parts */  \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    res1i OP1 alphar * temp1i;  \
    res1i OP2 alphai * temp1r;  \
    res2i OP1 alphar * temp2i;  \
    res2i OP2 alphai * temp2r;  \
    res3i OP1 alphar * temp3i;  \
    res3i OP2 alphai * temp3r;  \
    \
    y[0 * inc_y2] = res0r;  \
    y[1 * inc_y2] = res1r;  \
    y[2 * inc_y2] = res2r;  \
    y[3 * inc_y2] = res3r;  \
    \
    y[0 * inc_y2 + 1] = res0i;  \
    y[1 * inc_y2 + 1] = res1i;  \
    y[2 * inc_y2 + 1] = res2i;  \
    y[3 * inc_y2 + 1] = res3i;
|
||||
|
||||
/* Scale two finished column sums (temp0, temp1 r/i) by (alphar, alphai)
   and add them to two y entries spaced inc_y2 apart. */
#define CSCALE_STORE_Y2_GP()  \
    res0r = y[0 * inc_y2];  \
    res1r = y[1 * inc_y2];  \
    \
    res0i = y[0 * inc_y2 + 1];  \
    res1i = y[1 * inc_y2 + 1];  \
    \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    res1r += alphar * temp1r;  \
    res1r OP0 alphai * temp1i;  \
    \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    res1i OP1 alphar * temp1i;  \
    res1i OP2 alphai * temp1r;  \
    \
    y[0 * inc_y2] = res0r;  \
    y[1 * inc_y2] = res1r;  \
    \
    y[0 * inc_y2 + 1] = res0i;  \
    y[1 * inc_y2 + 1] = res1i;
|
||||
|
||||
|
||||
/* Scale one finished column sum (temp0r, temp0i) by (alphar, alphai) and
   add it to the y entry at y[0], y[1]. */
#define CSCALE_STORE_Y1_GP()  \
    res0r = y[0 * inc_y2];  \
    res0i = y[0 * inc_y2 + 1];  \
    \
    res0r += alphar * temp0r;  \
    res0r OP0 alphai * temp0i;  \
    \
    res0i OP1 alphar * temp0i;  \
    res0i OP2 alphai * temp0r;  \
    \
    y[0 * inc_y2] = res0r;  \
    y[0 * inc_y2 + 1] = res0i;
|
||||
|
||||
/* Load eight x entries with vector loads and split them into (x0r, x0i)
   and (x1r, x1i).  Selected by CNAME when inc_x2 == 2. */
#define CLOAD_X8_VECTOR()  \
    LD_SP4(x, 4, x0, x1, x2, x3);  \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);  \
    PCKEVOD_W2_SP(x3, x2, x1r, x1i);
|
||||
|
||||
/* Load four x entries with vector loads and split them into (x0r, x0i).
   Selected by CNAME when inc_x2 == 2. */
#define CLOAD_X4_VECTOR()  \
    LD_SP2(x, 4, x0, x1);  \
    PCKEVOD_W2_SP(x1, x0, x0r, x0i);
|
||||
|
||||
/* Gather eight strided x entries (stride inc_x2) into (x0r, x0i) and
   (x1r, x1i), one 32-bit lane at a time.  tp0r is only the starting
   vector for the first insert of each register; all four lanes are
   overwritten, and tp0r itself is not modified. */
#define CLOAD_X8_GP()  \
    /* values at x[n * inc_x2], entries 0..3 */  \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2)));  \
    /* values at x[n * inc_x2], entries 4..7 */  \
    x1r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2)));  \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 1, *((int *) (x + 5 * inc_x2)));  \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 2, *((int *) (x + 6 * inc_x2)));  \
    x1r = (v4f32) __msa_insert_w((v4i32) x1r, 3, *((int *) (x + 7 * inc_x2)));  \
    /* values at x[n * inc_x2 + 1], entries 0..3 */  \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1)));  \
    /* values at x[n * inc_x2 + 1], entries 4..7 */  \
    x1i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 4 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 1, *((int *) (x + 5 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 2, *((int *) (x + 6 * inc_x2 + 1)));  \
    x1i = (v4f32) __msa_insert_w((v4i32) x1i, 3, *((int *) (x + 7 * inc_x2 + 1)));
|
||||
|
||||
/* Gather four strided x entries (stride inc_x2) into (x0r, x0i), one
   32-bit lane at a time.  tp0r is only the starting vector for the first
   insert; every lane is overwritten. */
#define CLOAD_X4_GP()  \
    x0r = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 1, *((int *) (x + 1 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 2, *((int *) (x + 2 * inc_x2)));  \
    x0r = (v4f32) __msa_insert_w((v4i32) x0r, 3, *((int *) (x + 3 * inc_x2)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) tp0r, 0, *((int *) (x + 0 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 1, *((int *) (x + 1 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 2, *((int *) (x + 2 * inc_x2 + 1)));  \
    x0i = (v4f32) __msa_insert_w((v4i32) x0i, 3, *((int *) (x + 3 * inc_x2 + 1)));
|
||||
|
||||
/* Main loop nest of the transposed kernel, expanded by CNAME once per x
   stride case.

   Before each expansion the caller must #define CLOAD_X8 and CLOAD_X4 to
   the _VECTOR or _GP variant that matches the x stride.

   Columns are consumed four at a time, then an optional pair, then an
   optional single column.  For every column group the vector
   accumulators are cleared, x is rewound to srcx_org, the m rows are
   walked in steps of 8 and 4 with vector code, the accumulators are
   reduced to scalars, the remaining (m & 3) rows are added with scalar
   code, and the result is scaled by alpha and added to y.
   k is the FLOAT offset into the current columns (two FLOATs per row).

   Every helper invocation is terminated with ';' so the loop does not
   depend on whether a helper's expansion happens to end in one. */
#define CGEMV_T_MSA()  \
    /* ---- groups of four columns ---- */  \
    for (j = (n >> 2); j--;)  \
    {  \
        tp0r = tp1r = tp2r = tp3r = zero;  \
        tp0i = tp1i = tp2i = tp3i = zero;  \
        \
        k = 0;  \
        x = srcx_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_X8();  \
            CGEMV_T_8x4();  \
            \
            k += 2 * 8;  \
            x += inc_x2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_X4();  \
            \
            CGEMV_T_4x4();  \
            \
            k += 2 * 4;  \
            x += inc_x2 * 4;  \
        }  \
        \
        /* transpose + three adds: presumably leaves the sum of   */  \
        /* accumulator j in lane j of tp0r / tp0i (confirm        */  \
        /* TRANSPOSE4x4_SP_SP in macros_msa.h)                    */  \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp2r, tp3r,  \
                           tp0r, tp1r, tp2r, tp3r);  \
        TRANSPOSE4x4_SP_SP(tp0i, tp1i, tp2i, tp3i,  \
                           tp0i, tp1i, tp2i, tp3i);  \
        \
        tp0r += tp1r;  \
        tp0r += tp2r;  \
        tp0r += tp3r;  \
        tp0i += tp1i;  \
        tp0i += tp2i;  \
        tp0i += tp3i;  \
        \
        temp0r = tp0r[0];  \
        temp1r = tp0r[1];  \
        temp2r = tp0r[2];  \
        temp3r = tp0r[3];  \
        temp0i = tp0i[0];  \
        temp1i = tp0i[1];  \
        temp2i = tp0i[2];  \
        temp3i = tp0i[3];  \
        \
        for (i = (m & 3); i--;)  \
        {  \
            CGEMV_T_1x4();  \
            \
            k += 2;  \
            x += inc_x2;  \
        }  \
        \
        CSCALE_STORE_Y4_GP();  \
        \
        pa0 += 4 * lda2;  \
        pa1 += 4 * lda2;  \
        pa2 += 4 * lda2;  \
        pa3 += 4 * lda2;  \
        y += 4 * inc_y2;  \
    }  \
    \
    /* ---- optional pair of columns ---- */  \
    if (n & 2)  \
    {  \
        tp0r = tp1r = zero;  \
        tp0i = tp1i = zero;  \
        \
        k = 0;  \
        x = srcx_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_X8();  \
            \
            CGEMV_T_8x2();  \
            \
            k += 2 * 8;  \
            x += inc_x2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_X4();  \
            \
            CGEMV_T_4x2();  \
            \
            k += 2 * 4;  \
            x += inc_x2 * 4;  \
        }  \
        \
        /* one transpose over all four accumulators; after the    */  \
        /* adds the four sums are read from lanes 0..3 of tp0r    */  \
        TRANSPOSE4x4_SP_SP(tp0r, tp1r, tp0i, tp1i,  \
                           tp0r, tp1r, tp0i, tp1i);  \
        \
        tp0r += tp1r;  \
        tp0r += tp0i;  \
        tp0r += tp1i;  \
        \
        temp0r = tp0r[0];  \
        temp1r = tp0r[1];  \
        temp0i = tp0r[2];  \
        temp1i = tp0r[3];  \
        \
        for (i = (m & 3); i--;)  \
        {  \
            CGEMV_T_1x2();  \
            \
            k += 2;  \
            x += inc_x2;  \
        }  \
        \
        CSCALE_STORE_Y2_GP();  \
        \
        pa0 += 2 * lda2;  \
        pa1 += 2 * lda2;  \
        y += 2 * inc_y2;  \
    }  \
    \
    /* ---- optional last column ---- */  \
    if (n & 1)  \
    {  \
        tp0r = zero;  \
        tp0i = zero;  \
        \
        k = 0;  \
        x = srcx_org;  \
        \
        for (i = (m >> 3); i--;)  \
        {  \
            CLOAD_X8();  \
            \
            CGEMV_T_8x1();  \
            \
            k += 2 * 8;  \
            x += inc_x2 * 8;  \
        }  \
        \
        if (m & 4)  \
        {  \
            CLOAD_X4();  \
            \
            CGEMV_T_4x1();  \
            \
            k += 2 * 4;  \
            x += inc_x2 * 4;  \
        }  \
        \
        /* combine tp0r and tp0i, then fold the four lanes of t0  */  \
        /* into one real and one imaginary scalar                 */  \
        ILVRL_W2_SP(tp0i, tp0r, t0, t1);  \
        \
        t0 += t1;  \
        \
        temp0r = t0[0] + t0[2];  \
        temp0i = t0[1] + t0[3];  \
        \
        for (i = (m & 3); i--;)  \
        {  \
            CGEMV_T_1x1();  \
            \
            k += 2;  \
            x += inc_x2;  \
        }  \
        \
        CSCALE_STORE_Y1_GP();  \
        \
        pa0 += lda2;  \
        y += inc_y2;  \
    }
|
||||
|
||||
/* Transposed complex single-precision GEMV kernel for MSA.
 *
 * For each of the n columns of A, forms the sum over m rows of the
 * column entries times x, scales it by (alphar, alphai) and adds it to
 * the matching y entry; conjugation variants are selected at compile time
 * through OP0..OP2.
 *
 *   m, n       row and column counts
 *   dummy1     not referenced
 *   alphar/i   real and imaginary part of the scale factor
 *   A, lda     matrix and its column stride
 *   x, inc_x   input vector and its stride
 *   y, inc_y   vector updated in place and its stride
 *   buffer     not referenced
 *
 * lda, inc_x and inc_y are doubled into lda2 / inc_x2 / inc_y2 so that
 * they count FLOATs.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT *srcx_org = x;
    FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
    FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
    BLASLONG inc_x2, inc_y2, lda2;
    v4f32 zero = {0};
    v4f32 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
    v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v4f32 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v4f32 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v4f32 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;

    /* strides in FLOAT units */
    lda2 = 2 * lda;
    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;

    /* four consecutive columns of A */
    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    if (2 != inc_x2)
    {
        /* strided x: lane-by-lane gather */
        #define CLOAD_X8  CLOAD_X8_GP
        #define CLOAD_X4  CLOAD_X4_GP

        CGEMV_T_MSA();

        #undef CLOAD_X8
        #undef CLOAD_X4
    }
    else
    {
        /* unit-stride x: vector loads */
        #define CLOAD_X8  CLOAD_X8_VECTOR
        #define CLOAD_X4  CLOAD_X4_VECTOR

        CGEMV_T_MSA();

        #undef CLOAD_X8
        #undef CLOAD_X4
    }

    return(0);
}
|
||||
|
||||
#undef OP0
|
||||
#undef OP1
|
||||
#undef OP2
|
|
@ -0,0 +1,278 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Reinterprets a v2f64 value as v2i64, ANDs every 64-bit lane with the mask
 * named and_vec (which must be in scope where the macro is used) and
 * reinterprets the result as v2f64.  The argument is parenthesized so that
 * an expression argument is cast as a whole. */
#define AND_VEC_D(in) ((v2f64) ((v2i64) (in) & and_vec))
|
||||
|
||||
/* Sum of the absolute values of n elements of x taken with stride inc_x.
 *
 * n      number of elements; the result is 0 when n <= 0.
 * x      first element.
 * inc_x  distance between consecutive elements, counted in elements; the
 *        result is 0 when inc_x <= 0.
 *
 * Unit stride: two elements are consumed per v2f64 load, the sign bit is
 * masked off with AND_VEC_D() and vector number j of the stream is added to
 * vector accumulator (j & 3); a final odd element is added with fabs().
 *
 * Other strides: element i is added to scalar accumulator (i & 3) and the
 * four partial sums are combined left to right.  This path uses scalar
 * loads on purpose: a v2f64 load placed on a strided element also reads the
 * double that follows it, which for the last element lies outside the
 * vector described by (n, x, inc_x).
 */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
    BLASLONG i;
    FLOAT sumf = 0.0;
    double acc0 = 0.0, acc1 = 0.0, acc2 = 0.0, acc3 = 0.0;
    v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
    v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
    v2f64 zero_v = {0};
    v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};

    if (n <= 0 || inc_x <= 0) return (sumf);

    if (1 == inc_x)
    {
        if (n > 15)
        {
            /* Peel the first 16 elements so the accumulators start from
               loaded data instead of from zero. */
            n -= 16;

            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 = AND_VEC_D(src0);
            sum_abs1 = AND_VEC_D(src1);
            sum_abs2 = AND_VEC_D(src2);
            sum_abs3 = AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }
        else
        {
            sum_abs0 = zero_v;
            sum_abs1 = zero_v;
            sum_abs2 = zero_v;
            sum_abs3 = zero_v;
        }

        for (i = (n >> 4); i--;)
        {
            LD_DP8_INC(x, 2, src0, src1, src2, src3, src4, src5, src6, src7);

            sum_abs0 += AND_VEC_D(src0);
            sum_abs1 += AND_VEC_D(src1);
            sum_abs2 += AND_VEC_D(src2);
            sum_abs3 += AND_VEC_D(src3);
            sum_abs0 += AND_VEC_D(src4);
            sum_abs1 += AND_VEC_D(src5);
            sum_abs2 += AND_VEC_D(src6);
            sum_abs3 += AND_VEC_D(src7);
        }

        /* Bits 1..3 of n give the number of whole vectors (0..7) left. */
        switch ((n >> 1) & 7)
        {
            case 7:
                LD_DP7_INC(x, 2, src0, src1, src2, src3, src4, src5, src6);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                sum_abs2 += AND_VEC_D(src6);
                break;

            case 6:
                LD_DP6_INC(x, 2, src0, src1, src2, src3, src4, src5);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                sum_abs1 += AND_VEC_D(src5);
                break;

            case 5:
                LD_DP5_INC(x, 2, src0, src1, src2, src3, src4);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                sum_abs0 += AND_VEC_D(src4);
                break;

            case 4:
                LD_DP4_INC(x, 2, src0, src1, src2, src3);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                sum_abs3 += AND_VEC_D(src3);
                break;

            case 3:
                LD_DP3_INC(x, 2, src0, src1, src2);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                sum_abs2 += AND_VEC_D(src2);
                break;

            case 2:
                LD_DP2_INC(x, 2, src0, src1);

                sum_abs0 += AND_VEC_D(src0);
                sum_abs1 += AND_VEC_D(src1);
                break;

            case 1:
                src0 = LD_DP(x); x += 2;

                sum_abs0 += AND_VEC_D(src0);
                break;

            default:
                break;
        }

        sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;

        sumf = sum_abs0[0] + sum_abs0[1];

        /* At most one element is left after the whole vectors. */
        if (n & 1)
        {
            sumf += fabs(*x);
        }
    }
    else
    {
        for (i = (n >> 2); i--;)
        {
            acc0 += fabs(x[0]);
            acc1 += fabs(x[inc_x]);
            acc2 += fabs(x[2 * inc_x]);
            acc3 += fabs(x[3 * inc_x]);

            x += 4 * inc_x;
        }

        /* Remaining 0..3 elements; each one goes to a different accumulator,
           so the order of these additions does not matter. */
        switch (n & 3)
        {
            case 3:
                acc2 += fabs(x[2 * inc_x]);
                /* fallthrough */
            case 2:
                acc1 += fabs(x[inc_x]);
                /* fallthrough */
            case 1:
                acc0 += fabs(x[0]);
                break;

            default:
                break;
        }

        sumf = acc0 + acc1 + acc2 + acc3;
    }

    return (sumf);
}
|
|
@ -0,0 +1,189 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* return float, x,y float */
|
||||
#if defined(DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
double dot = 0.0;
|
||||
FLOAT x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v2f64 dot0 = {0, 0};
|
||||
|
||||
if (n < 0) return (dot);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
for (i = (n >> 4); i--;)
|
||||
{
|
||||
LD_DP8_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_DP8_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
dot0 += (vy7 * vx7);
|
||||
}
|
||||
|
||||
if (n & 15)
|
||||
{
|
||||
if ((n & 8) && (n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP7_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
|
||||
LD_DP7_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_DP6_INC(x, 2, vx0, vx1, vx2, vx3, vx4, vx5);
|
||||
LD_DP6_INC(y, 2, vy0, vy1, vy2, vy3, vy4, vy5);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
}
|
||||
else if ((n & 8) && (n & 2))
|
||||
{
|
||||
LD_DP5_INC(x, 2, vx0, vx1, vx2, vx3, vx4);
|
||||
LD_DP5_INC(y, 2, vy0, vy1, vy2, vy3, vy4);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
}
|
||||
else if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP3_INC(x, 2, vx0, vx1, vx2);
|
||||
LD_DP3_INC(y, 2, vy0, vy1, vy2);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_DP4_INC(x, 2, vx0, vx1, vx2, vx3);
|
||||
LD_DP4_INC(y, 2, vy0, vy1, vy2, vy3);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP2_INC(x, 2, vx0, vx1);
|
||||
LD_DP2_INC(y, 2, vy0, vy1);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
vx0 = LD_DP(x); x += 2;
|
||||
vy0 = LD_DP(y); y += 2;
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
dot += dot0[0];
|
||||
dot += dot0[1];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
dot += (y3 * x3);
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(x, inc_x, x0, x1, x2);
|
||||
LD_GP3_INC(y, inc_y, y0, y1, y2);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(y, inc_y, y0, y1);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
return (dot);
|
||||
}
|
|
@ -0,0 +1,577 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Accumulates 8 rows x 8 columns into y0..y3.
 * Loads four vectors from each of pa0..pa7 at offset k into t0..t31 and adds
 * tp<j> times the vectors of column j to y0..y3, column by column.
 * Uses pa0..pa7, k, tp0..tp7, y0..y3 and t0..t31 from the expansion site.
 * Wrapped in do/while (0) so it behaves as a single statement. */
#define DGEMV_N_8x8() \
    do \
    { \
        LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
        LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
        LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
        LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
        LD_DP4(pa4 + k, 2, t16, t17, t18, t19); \
        LD_DP4(pa5 + k, 2, t20, t21, t22, t23); \
        LD_DP4(pa6 + k, 2, t24, t25, t26, t27); \
        LD_DP4(pa7 + k, 2, t28, t29, t30, t31); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        y2 += tp0 * t2; \
        y3 += tp0 * t3; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
        y2 += tp1 * t6; \
        y3 += tp1 * t7; \
        \
        y0 += tp2 * t8; \
        y1 += tp2 * t9; \
        y2 += tp2 * t10; \
        y3 += tp2 * t11; \
        \
        y0 += tp3 * t12; \
        y1 += tp3 * t13; \
        y2 += tp3 * t14; \
        y3 += tp3 * t15; \
        \
        y0 += tp4 * t16; \
        y1 += tp4 * t17; \
        y2 += tp4 * t18; \
        y3 += tp4 * t19; \
        \
        y0 += tp5 * t20; \
        y1 += tp5 * t21; \
        y2 += tp5 * t22; \
        y3 += tp5 * t23; \
        \
        y0 += tp6 * t24; \
        y1 += tp6 * t25; \
        y2 += tp6 * t26; \
        y3 += tp6 * t27; \
        \
        y0 += tp7 * t28; \
        y1 += tp7 * t29; \
        y2 += tp7 * t30; \
        y3 += tp7 * t31; \
    } while (0)
|
||||
|
||||
/* Accumulates 4 rows x 8 columns into y0 and y1.
 * Loads two vectors from each of pa0..pa7 at offset k and adds tp<j> times
 * the vectors of column j to y0 and y1, column by column.
 * Uses pa0..pa7, k, tp0..tp7, y0, y1 and the t* temporaries from the
 * expansion site.  Wrapped in do/while (0) so it is a single statement. */
#define DGEMV_N_4x8() \
    do \
    { \
        LD_DP2(pa0 + k, 2, t0, t1); \
        LD_DP2(pa1 + k, 2, t4, t5); \
        LD_DP2(pa2 + k, 2, t8, t9); \
        LD_DP2(pa3 + k, 2, t12, t13); \
        LD_DP2(pa4 + k, 2, t16, t17); \
        LD_DP2(pa5 + k, 2, t20, t21); \
        LD_DP2(pa6 + k, 2, t24, t25); \
        LD_DP2(pa7 + k, 2, t28, t29); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
        \
        y0 += tp2 * t8; \
        y1 += tp2 * t9; \
        \
        y0 += tp3 * t12; \
        y1 += tp3 * t13; \
        \
        y0 += tp4 * t16; \
        y1 += tp4 * t17; \
        \
        y0 += tp5 * t20; \
        y1 += tp5 * t21; \
        \
        y0 += tp6 * t24; \
        y1 += tp6 * t25; \
        \
        y0 += tp7 * t28; \
        y1 += tp7 * t29; \
    } while (0)
|
||||
|
||||
/* Accumulates 8 rows x 4 columns into y0..y3.
 * Loads four vectors from each of pa0..pa3 at offset k into t0..t15 and adds
 * tp<j> times the vectors of column j to y0..y3, column by column.
 * Uses pa0..pa3, k, tp0..tp3, y0..y3 and t0..t15 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DGEMV_N_8x4() \
    do \
    { \
        LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
        LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
        LD_DP4(pa2 + k, 2, t8, t9, t10, t11); \
        LD_DP4(pa3 + k, 2, t12, t13, t14, t15); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        y2 += tp0 * t2; \
        y3 += tp0 * t3; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
        y2 += tp1 * t6; \
        y3 += tp1 * t7; \
        \
        y0 += tp2 * t8; \
        y1 += tp2 * t9; \
        y2 += tp2 * t10; \
        y3 += tp2 * t11; \
        \
        y0 += tp3 * t12; \
        y1 += tp3 * t13; \
        y2 += tp3 * t14; \
        y3 += tp3 * t15; \
    } while (0)
|
||||
|
||||
/* Accumulates 4 rows x 4 columns into y0 and y1.
 * Loads two vectors from each of pa0..pa3 at offset k and adds tp<j> times
 * the vectors of column j to y0 and y1, column by column.
 * Uses pa0..pa3, k, tp0..tp3, y0, y1 and the t* temporaries from the
 * expansion site.  Wrapped in do/while (0) so it is a single statement. */
#define DGEMV_N_4x4() \
    do \
    { \
        LD_DP2(pa0 + k, 2, t0, t1); \
        LD_DP2(pa1 + k, 2, t4, t5); \
        LD_DP2(pa2 + k, 2, t8, t9); \
        LD_DP2(pa3 + k, 2, t12, t13); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
        \
        y0 += tp2 * t8; \
        y1 += tp2 * t9; \
        \
        y0 += tp3 * t12; \
        y1 += tp3 * t13; \
    } while (0)
|
||||
|
||||
/* Accumulates 8 rows x 2 columns into y0..y3.
 * Loads four vectors from each of pa0 and pa1 at offset k into t0..t7 and
 * adds tp0 times the first column and tp1 times the second to y0..y3.
 * Uses pa0, pa1, k, tp0, tp1, y0..y3 and t0..t7 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DGEMV_N_8x2() \
    do \
    { \
        LD_DP4(pa0 + k, 2, t0, t1, t2, t3); \
        LD_DP4(pa1 + k, 2, t4, t5, t6, t7); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        y2 += tp0 * t2; \
        y3 += tp0 * t3; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
        y2 += tp1 * t6; \
        y3 += tp1 * t7; \
    } while (0)
|
||||
|
||||
/* Accumulates 4 rows x 2 columns into y0 and y1.
 * Loads two vectors from each of pa0 and pa1 at offset k and adds tp0 times
 * the first column and tp1 times the second to y0 and y1.
 * Uses pa0, pa1, k, tp0, tp1, y0, y1, t0, t1, t4 and t5 from the expansion
 * site.  Wrapped in do/while (0) so it is a single statement. */
#define DGEMV_N_4x2() \
    do \
    { \
        LD_DP2(pa0 + k, 2, t0, t1); \
        LD_DP2(pa1 + k, 2, t4, t5); \
        \
        y0 += tp0 * t0; \
        y1 += tp0 * t1; \
        \
        y0 += tp1 * t4; \
        y1 += tp1 * t5; \
    } while (0)
|
||||
|
||||
/* Strided x loader for eight columns: temp<j> = alpha * x[j * inc_x] for
 * j = 0..7, then tp<j> is set to COPY_DOUBLE_TO_VECTOR(temp<j>).
 * Uses alpha, x, inc_x, temp0..temp7 and tp0..tp7 from the expansion site.
 * Wrapped in do/while (0) and ends without a line continuation, so it is a
 * single statement that does not absorb the line following it. */
#define DLOAD_X8_SCALE_GP() \
    do \
    { \
        temp0 = alpha * x[0 * inc_x]; \
        temp1 = alpha * x[1 * inc_x]; \
        temp2 = alpha * x[2 * inc_x]; \
        temp3 = alpha * x[3 * inc_x]; \
        temp4 = alpha * x[4 * inc_x]; \
        temp5 = alpha * x[5 * inc_x]; \
        temp6 = alpha * x[6 * inc_x]; \
        temp7 = alpha * x[7 * inc_x]; \
        \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
        tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
        tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
        tp4 = COPY_DOUBLE_TO_VECTOR(temp4); \
        tp5 = COPY_DOUBLE_TO_VECTOR(temp5); \
        tp6 = COPY_DOUBLE_TO_VECTOR(temp6); \
        tp7 = COPY_DOUBLE_TO_VECTOR(temp7); \
    } while (0)
|
||||
|
||||
/* Strided x loader for four columns: temp<j> = alpha * x[j * inc_x] for
 * j = 0..3, then tp<j> is set to COPY_DOUBLE_TO_VECTOR(temp<j>).
 * Uses alpha, x, inc_x, temp0..temp3 and tp0..tp3 from the expansion site.
 * Wrapped in do/while (0) and ends without a line continuation, so it is a
 * single statement that does not absorb the line following it. */
#define DLOAD_X4_SCALE_GP() \
    do \
    { \
        temp0 = alpha * x[0 * inc_x]; \
        temp1 = alpha * x[1 * inc_x]; \
        temp2 = alpha * x[2 * inc_x]; \
        temp3 = alpha * x[3 * inc_x]; \
        \
        tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
        tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
        tp2 = COPY_DOUBLE_TO_VECTOR(temp2); \
        tp3 = COPY_DOUBLE_TO_VECTOR(temp3); \
    } while (0)
|
||||
|
||||
/* Contiguous x loader for eight columns: loads x0..x3 from x with LD_DP4,
 * multiplies each by v_alpha and expands them into tp0..tp7 with
 * SPLATI_D2_DP (one pair of outputs per input vector).
 * Uses x, v_alpha, x0..x3 and tp0..tp7 from the expansion site.
 * Wrapped in do/while (0) and ends without a line continuation, so it is a
 * single statement that does not absorb the line following it. */
#define DLOAD_X8_SCALE_VECTOR() \
    do \
    { \
        LD_DP4(x, 2, x0, x1, x2, x3); \
        \
        x0 = x0 * v_alpha; \
        x1 = x1 * v_alpha; \
        x2 = x2 * v_alpha; \
        x3 = x3 * v_alpha; \
        \
        SPLATI_D2_DP(x0, tp0, tp1); \
        SPLATI_D2_DP(x1, tp2, tp3); \
        SPLATI_D2_DP(x2, tp4, tp5); \
        SPLATI_D2_DP(x3, tp6, tp7); \
    } while (0)
|
||||
|
||||
/* Contiguous x loader for four columns: loads x0 and x1 from x with LD_DP2,
 * multiplies each by v_alpha and expands them into tp0..tp3 with
 * SPLATI_D2_DP (one pair of outputs per input vector).
 * Uses x, v_alpha, x0, x1 and tp0..tp3 from the expansion site.
 * Wrapped in do/while (0) and ends without a line continuation, so it is a
 * single statement that does not absorb the line following it. */
#define DLOAD_X4_SCALE_VECTOR() \
    do \
    { \
        LD_DP2(x, 2, x0, x1); \
        \
        x0 = x0 * v_alpha; \
        x1 = x1 * v_alpha; \
        \
        SPLATI_D2_DP(x0, tp0, tp1); \
        SPLATI_D2_DP(x1, tp2, tp3); \
    } while (0)
|
||||
|
||||
/* Strided y loader for eight rows: gathers y[0 * inc_y] .. y[7 * inc_y]
 * into the lanes of y0..y3, two consecutive rows per vector.
 * The lanes are filled straight from the double values, so the storage of y
 * is never read through a long long pointer and no other vector is needed
 * as an insertion base.  Uses y, inc_y and y0..y3 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DLOAD_Y8_GP() \
    do \
    { \
        y0 = (v2f64) {y[0 * inc_y], y[1 * inc_y]}; \
        y1 = (v2f64) {y[2 * inc_y], y[3 * inc_y]}; \
        y2 = (v2f64) {y[4 * inc_y], y[5 * inc_y]}; \
        y3 = (v2f64) {y[6 * inc_y], y[7 * inc_y]}; \
    } while (0)
|
||||
|
||||
/* Strided y loader for four rows: gathers y[0 * inc_y] .. y[3 * inc_y] into
 * the lanes of y0 and y1, two consecutive rows per vector.
 * The lanes are filled straight from the double values, so the storage of y
 * is never read through a long long pointer and no other vector is needed
 * as an insertion base.  Uses y, inc_y, y0 and y1 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DLOAD_Y4_GP() \
    do \
    { \
        y0 = (v2f64) {y[0 * inc_y], y[1 * inc_y]}; \
        y1 = (v2f64) {y[2 * inc_y], y[3 * inc_y]}; \
    } while (0)
|
||||
|
||||
/* Contiguous y loader for eight rows: y0..y3 are loaded from y with LD_DP4.
 * No trailing semicolon: the invocation supplies it. */
#define DLOAD_Y8_VECTOR() LD_DP4(y, 2, y0, y1, y2, y3)
|
||||
/* Contiguous y loader for four rows: y0 and y1 are loaded from y with
 * LD_DP2.  No trailing semicolon: the invocation supplies it. */
#define DLOAD_Y4_VECTOR() LD_DP2(y, 2, y0, y1)
|
||||
|
||||
/* Strided y store for eight rows: scatters the lanes of y0..y3 to
 * y[0 * inc_y] .. y[7 * inc_y], two consecutive rows per vector.
 * The lanes are written as double values, so the storage of y is never
 * accessed through a long long pointer.
 * Uses y, inc_y and y0..y3 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DSTORE_Y8_GP() \
    do \
    { \
        y[0 * inc_y] = y0[0]; \
        y[1 * inc_y] = y0[1]; \
        y[2 * inc_y] = y1[0]; \
        y[3 * inc_y] = y1[1]; \
        y[4 * inc_y] = y2[0]; \
        y[5 * inc_y] = y2[1]; \
        y[6 * inc_y] = y3[0]; \
        y[7 * inc_y] = y3[1]; \
    } while (0)
|
||||
|
||||
/* Strided y store for four rows: scatters the lanes of y0 and y1 to
 * y[0 * inc_y] .. y[3 * inc_y], two consecutive rows per vector.
 * The lanes are written as double values, so the storage of y is never
 * accessed through a long long pointer.
 * Uses y, inc_y, y0 and y1 from the expansion site.
 * Wrapped in do/while (0) so it is a single statement. */
#define DSTORE_Y4_GP() \
    do \
    { \
        y[0 * inc_y] = y0[0]; \
        y[1 * inc_y] = y0[1]; \
        y[2 * inc_y] = y1[0]; \
        y[3 * inc_y] = y1[1]; \
    } while (0)
|
||||
|
||||
/* Contiguous y store for eight rows: y0..y3 are stored to y with ST_DP4.
 * No trailing semicolon: the invocation supplies it. */
#define DSTORE_Y8_VECTOR() ST_DP4(y0, y1, y2, y3, y, 2)
|
||||
/* Contiguous y store for four rows: y0 and y1 are stored to y with ST_DP2.
 * No trailing semicolon: the invocation supplies it. */
#define DSTORE_Y4_VECTOR() ST_DP2(y0, y1, y, 2)
|
||||
|
||||
/* Body of the non-transposed GEMV update: for every row i,
 * y[i * inc_y] += alpha * sum over columns j of A(i, j) * x[j * inc_x].
 *
 * Columns are consumed in groups of 8, then 4, then 2, then 1.  Inside each
 * group the rows are consumed in blocks of 8 and 4 with the vector kernels
 * and the last (m & 3) rows with scalar code.  pa0..pa7 point at the columns
 * of the current group and are advanced together with x after each group;
 * y restarts from y_org for every group.
 *
 * The loaders, kernels and stores are selected by the DLOAD_X8_SCALE,
 * DLOAD_X4_SCALE, DLOAD_Y8, DLOAD_Y4, DSTORE_Y8 and DSTORE_Y4 aliases, which
 * must be defined before the macro is expanded.  In the 8- and 4-column
 * groups temp0..temp7 are recomputed before the scalar row tail because not
 * every x loader sets them; the 2-column group sets temp0/temp1 itself, so
 * they are not recomputed there.
 *
 * Uses i, j, k, m, n, alpha, lda, x, inc_x, y, y_org, inc_y, temp and the
 * temp<j>/tp<j>/pa<j> names from the expansion site.  Wrapped in
 * do/while (0) and ends without a line continuation, so it is a single
 * statement that does not absorb the line following it. */
#define DGEMV_N_MSA() \
    do \
    { \
        for (j = (n >> 3); j--;) \
        { \
            DLOAD_X8_SCALE(); \
            \
            k = 0; \
            y = y_org; \
            \
            for (i = (m >> 3); i--;) \
            { \
                DLOAD_Y8(); \
                DGEMV_N_8x8(); \
                DSTORE_Y8(); \
                \
                y += 8 * inc_y; \
                k += 8; \
            } \
            \
            if (m & 4) \
            { \
                DLOAD_Y4(); \
                DGEMV_N_4x8(); \
                DSTORE_Y4(); \
                \
                y += 4 * inc_y; \
                k += 4; \
            } \
            \
            if (m & 3) \
            { \
                temp0 = alpha * x[0 * inc_x]; \
                temp1 = alpha * x[1 * inc_x]; \
                temp2 = alpha * x[2 * inc_x]; \
                temp3 = alpha * x[3 * inc_x]; \
                temp4 = alpha * x[4 * inc_x]; \
                temp5 = alpha * x[5 * inc_x]; \
                temp6 = alpha * x[6 * inc_x]; \
                temp7 = alpha * x[7 * inc_x]; \
                \
                for (i = (m & 3); i--;) \
                { \
                    temp = y[0]; \
                    temp += temp0 * pa0[k]; \
                    temp += temp1 * pa1[k]; \
                    temp += temp2 * pa2[k]; \
                    temp += temp3 * pa3[k]; \
                    temp += temp4 * pa4[k]; \
                    temp += temp5 * pa5[k]; \
                    temp += temp6 * pa6[k]; \
                    temp += temp7 * pa7[k]; \
                    y[0] = temp; \
                    \
                    y += inc_y; \
                    k++; \
                } \
            } \
            \
            pa0 += 8 * lda; \
            pa1 += 8 * lda; \
            pa2 += 8 * lda; \
            pa3 += 8 * lda; \
            pa4 += 8 * lda; \
            pa5 += 8 * lda; \
            pa6 += 8 * lda; \
            pa7 += 8 * lda; \
            \
            x += 8 * inc_x; \
        } \
        \
        if (n & 4) \
        { \
            DLOAD_X4_SCALE(); \
            \
            k = 0; \
            y = y_org; \
            \
            for (i = (m >> 3); i--;) \
            { \
                DLOAD_Y8(); \
                DGEMV_N_8x4(); \
                DSTORE_Y8(); \
                \
                y += 8 * inc_y; \
                k += 8; \
            } \
            \
            if (m & 4) \
            { \
                DLOAD_Y4(); \
                DGEMV_N_4x4(); \
                DSTORE_Y4(); \
                \
                y += 4 * inc_y; \
                k += 4; \
            } \
            \
            if (m & 3) \
            { \
                temp0 = alpha * x[0 * inc_x]; \
                temp1 = alpha * x[1 * inc_x]; \
                temp2 = alpha * x[2 * inc_x]; \
                temp3 = alpha * x[3 * inc_x]; \
                \
                for (i = (m & 3); i--;) \
                { \
                    temp = y[0]; \
                    temp += temp0 * pa0[k]; \
                    temp += temp1 * pa1[k]; \
                    temp += temp2 * pa2[k]; \
                    temp += temp3 * pa3[k]; \
                    y[0] = temp; \
                    \
                    y += inc_y; \
                    k++; \
                } \
            } \
            \
            pa0 += 4 * lda; \
            pa1 += 4 * lda; \
            pa2 += 4 * lda; \
            pa3 += 4 * lda; \
            \
            x += 4 * inc_x; \
        } \
        \
        if (n & 2) \
        { \
            temp0 = alpha * x[0 * inc_x]; \
            temp1 = alpha * x[1 * inc_x]; \
            \
            tp0 = COPY_DOUBLE_TO_VECTOR(temp0); \
            tp1 = COPY_DOUBLE_TO_VECTOR(temp1); \
            \
            k = 0; \
            y = y_org; \
            \
            for (i = (m >> 3); i--;) \
            { \
                DLOAD_Y8(); \
                DGEMV_N_8x2(); \
                DSTORE_Y8(); \
                \
                y += 8 * inc_y; \
                k += 8; \
            } \
            \
            if (m & 4) \
            { \
                DLOAD_Y4(); \
                DGEMV_N_4x2(); \
                DSTORE_Y4(); \
                \
                y += 4 * inc_y; \
                k += 4; \
            } \
            \
            /* temp0 and temp1 still hold the values computed above. */ \
            for (i = (m & 3); i--;) \
            { \
                temp = y[0]; \
                temp += temp0 * pa0[k]; \
                temp += temp1 * pa1[k]; \
                y[0] = temp; \
                \
                y += inc_y; \
                k++; \
            } \
            \
            pa0 += 2 * lda; \
            pa1 += 2 * lda; \
            \
            x += 2 * inc_x; \
        } \
        \
        if (n & 1) \
        { \
            temp = alpha * x[0]; \
            \
            k = 0; \
            y = y_org; \
            \
            for (i = m; i--;) \
            { \
                y[0] += temp * pa0[k]; \
                y += inc_y; \
                k++; \
            } \
        } \
    } while (0)
|
||||
|
||||
/* Entry point of the non-transposed GEMV kernel built on DGEMV_N_MSA().
 *
 * m, n          row and column counts used by the DGEMV_N_MSA() loops.
 * alpha         scalar applied to the elements of x.
 * A, lda        matrix; column j starts at A + j * lda.
 * x, inc_x      input vector and its stride, in elements.
 * y, inc_y      vector updated in place and its stride, in elements.
 * dummy1, buffer  not referenced directly in this body.
 *
 * DGEMV_N_MSA() is expanded once for each combination of contiguous and
 * strided x and y, so that each expansion is built from the matching
 * loaders and stores.  Always returns 0.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    /* The locals are shared by name with the DGEMV_N_MSA() expansion. */
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    v2f64 v_alpha;
    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;

    v_alpha = COPY_DOUBLE_TO_VECTOR(alpha);

    /* Eight consecutive columns of A, lda elements apart. */
    pa0 = A;
    pa1 = pa0 + lda;
    pa2 = pa1 + lda;
    pa3 = pa2 + lda;
    pa4 = pa3 + lda;
    pa5 = pa4 + lda;
    pa6 = pa5 + lda;
    pa7 = pa6 + lda;

    if (1 == inc_y)
    {
        /* Contiguous y: vector loads and stores. */
#define DLOAD_Y8 DLOAD_Y8_VECTOR
#define DLOAD_Y4 DLOAD_Y4_VECTOR
#define DSTORE_Y8 DSTORE_Y8_VECTOR
#define DSTORE_Y4 DSTORE_Y4_VECTOR

        if (1 == inc_x)
        {
#define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
#define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }
        else
        {
#define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
#define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }

#undef DLOAD_Y8
#undef DLOAD_Y4
#undef DSTORE_Y8
#undef DSTORE_Y4
    }
    else
    {
        /* Strided y: element-wise (GP) loads and stores. */
#define DLOAD_Y8 DLOAD_Y8_GP
#define DLOAD_Y4 DLOAD_Y4_GP
#define DSTORE_Y8 DSTORE_Y8_GP
#define DSTORE_Y4 DSTORE_Y4_GP

        if (1 == inc_x)
        {
#define DLOAD_X8_SCALE DLOAD_X8_SCALE_VECTOR
#define DLOAD_X4_SCALE DLOAD_X4_SCALE_VECTOR

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }
        else
        {
#define DLOAD_X8_SCALE DLOAD_X8_SCALE_GP
#define DLOAD_X4_SCALE DLOAD_X4_SCALE_GP

            DGEMV_N_MSA();

#undef DLOAD_X8_SCALE
#undef DLOAD_X4_SCALE
        }

#undef DLOAD_Y8
#undef DLOAD_Y4
#undef DSTORE_Y8
#undef DSTORE_Y4
    }

    return 0;
}
|
|
@ -0,0 +1,589 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* One step of the transposed DGEMV inner product over eight pointers
 * (pa0..pa7) and four x vectors (x0..x3).
 *
 * For every pointer pa<c> the four vectors read at offset k are multiplied
 * by x0, x1, x2, x3 and added, in that order, to the accumulator tp<c>.
 * Expects pa0..pa7, k, x0..x3, t0..t31 and tp0..tp7 in the enclosing scope. */
#define DGEMV_T_8x8()                          \
do {                                           \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    tp0 += x0 * t0;                            \
    tp0 += x1 * t1;                            \
    tp0 += x2 * t2;                            \
    tp0 += x3 * t3;                            \
                                               \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    tp1 += x0 * t4;                            \
    tp1 += x1 * t5;                            \
    tp1 += x2 * t6;                            \
    tp1 += x3 * t7;                            \
                                               \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    tp2 += x0 * t8;                            \
    tp2 += x1 * t9;                            \
    tp2 += x2 * t10;                           \
    tp2 += x3 * t11;                           \
                                               \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    tp3 += x0 * t12;                           \
    tp3 += x1 * t13;                           \
    tp3 += x2 * t14;                           \
    tp3 += x3 * t15;                           \
                                               \
    LD_DP4(pa4 + k, 2, t16, t17, t18, t19);    \
    tp4 += x0 * t16;                           \
    tp4 += x1 * t17;                           \
    tp4 += x2 * t18;                           \
    tp4 += x3 * t19;                           \
                                               \
    LD_DP4(pa5 + k, 2, t20, t21, t22, t23);    \
    tp5 += x0 * t20;                           \
    tp5 += x1 * t21;                           \
    tp5 += x2 * t22;                           \
    tp5 += x3 * t23;                           \
                                               \
    LD_DP4(pa6 + k, 2, t24, t25, t26, t27);    \
    tp6 += x0 * t24;                           \
    tp6 += x1 * t25;                           \
    tp6 += x2 * t26;                           \
    tp6 += x3 * t27;                           \
                                               \
    LD_DP4(pa7 + k, 2, t28, t29, t30, t31);    \
    tp7 += x0 * t28;                           \
    tp7 += x1 * t29;                           \
    tp7 += x2 * t30;                           \
    tp7 += x3 * t31;                           \
} while (0)
|
||||
|
||||
/* Same as the 8x8 step but with only two x vectors (x0, x1): for every
 * pointer pa<c> (c = 0..7) two vectors read at offset k are multiplied by
 * x0 and x1 and added, in that order, to tp<c>. */
#define DGEMV_T_8x4()                 \
do {                                  \
    LD_DP2(pa0 + k, 2, t0, t1);       \
    tp0 += x0 * t0;                   \
    tp0 += x1 * t1;                   \
                                      \
    LD_DP2(pa1 + k, 2, t4, t5);       \
    tp1 += x0 * t4;                   \
    tp1 += x1 * t5;                   \
                                      \
    LD_DP2(pa2 + k, 2, t8, t9);       \
    tp2 += x0 * t8;                   \
    tp2 += x1 * t9;                   \
                                      \
    LD_DP2(pa3 + k, 2, t12, t13);     \
    tp3 += x0 * t12;                  \
    tp3 += x1 * t13;                  \
                                      \
    LD_DP2(pa4 + k, 2, t16, t17);     \
    tp4 += x0 * t16;                  \
    tp4 += x1 * t17;                  \
                                      \
    LD_DP2(pa5 + k, 2, t20, t21);     \
    tp5 += x0 * t20;                  \
    tp5 += x1 * t21;                  \
                                      \
    LD_DP2(pa6 + k, 2, t24, t25);     \
    tp6 += x0 * t24;                  \
    tp6 += x1 * t25;                  \
                                      \
    LD_DP2(pa7 + k, 2, t28, t29);     \
    tp7 += x0 * t28;                  \
    tp7 += x1 * t29;                  \
} while (0)
|
||||
|
||||
/* Single-vector step over eight pointers: one vector is read at offset k
 * from each of pa0..pa7, multiplied by x0 and added to tp0..tp7. */
#define DGEMV_T_8x2()            \
do {                             \
    t0 = LD_DP(pa0 + k);         \
    tp0 += x0 * t0;              \
                                 \
    t4 = LD_DP(pa1 + k);         \
    tp1 += x0 * t4;              \
                                 \
    t8 = LD_DP(pa2 + k);         \
    tp2 += x0 * t8;              \
                                 \
    t12 = LD_DP(pa3 + k);        \
    tp3 += x0 * t12;             \
                                 \
    t16 = LD_DP(pa4 + k);        \
    tp4 += x0 * t16;             \
                                 \
    t20 = LD_DP(pa5 + k);        \
    tp5 += x0 * t20;             \
                                 \
    t24 = LD_DP(pa6 + k);        \
    tp6 += x0 * t24;             \
                                 \
    t28 = LD_DP(pa7 + k);        \
    tp7 += x0 * t28;             \
} while (0)
|
||||
|
||||
/* Four-pointer variant of the 8x8 step: for pa0..pa3 the four vectors read
 * at offset k are multiplied by x0..x3 and added, in that order, to
 * tp0..tp3. */
#define DGEMV_T_4x8()                          \
do {                                           \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    tp0 += x0 * t0;                            \
    tp0 += x1 * t1;                            \
    tp0 += x2 * t2;                            \
    tp0 += x3 * t3;                            \
                                               \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    tp1 += x0 * t4;                            \
    tp1 += x1 * t5;                            \
    tp1 += x2 * t6;                            \
    tp1 += x3 * t7;                            \
                                               \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    tp2 += x0 * t8;                            \
    tp2 += x1 * t9;                            \
    tp2 += x2 * t10;                           \
    tp2 += x3 * t11;                           \
                                               \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    tp3 += x0 * t12;                           \
    tp3 += x1 * t13;                           \
    tp3 += x2 * t14;                           \
    tp3 += x3 * t15;                           \
} while (0)
|
||||
|
||||
/* Four pointers, two x vectors: for pa0..pa3 two vectors read at offset k
 * are multiplied by x0 and x1 and added, in that order, to tp0..tp3. */
#define DGEMV_T_4x4()                 \
do {                                  \
    LD_DP2(pa0 + k, 2, t0, t1);       \
    tp0 += x0 * t0;                   \
    tp0 += x1 * t1;                   \
                                      \
    LD_DP2(pa1 + k, 2, t4, t5);       \
    tp1 += x0 * t4;                   \
    tp1 += x1 * t5;                   \
                                      \
    LD_DP2(pa2 + k, 2, t8, t9);       \
    tp2 += x0 * t8;                   \
    tp2 += x1 * t9;                   \
                                      \
    LD_DP2(pa3 + k, 2, t12, t13);     \
    tp3 += x0 * t12;                  \
    tp3 += x1 * t13;                  \
} while (0)
|
||||
|
||||
/* Four pointers, one x vector: one vector read at offset k from each of
 * pa0..pa3 is multiplied by x0 and added to tp0..tp3. */
#define DGEMV_T_4x2()            \
do {                             \
    t0 = LD_DP(pa0 + k);         \
    tp0 += x0 * t0;              \
                                 \
    t4 = LD_DP(pa1 + k);         \
    tp1 += x0 * t4;              \
                                 \
    t8 = LD_DP(pa2 + k);         \
    tp2 += x0 * t8;              \
                                 \
    t12 = LD_DP(pa3 + k);        \
    tp3 += x0 * t12;             \
} while (0)
|
||||
|
||||
/* Two pointers, four x vectors: for pa0 and pa1 the four vectors read at
 * offset k are multiplied by x0..x3 and added, in that order, to tp0/tp1. */
#define DGEMV_T_2x8()                      \
do {                                       \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);    \
    tp0 += x0 * t0;                        \
    tp0 += x1 * t1;                        \
    tp0 += x2 * t2;                        \
    tp0 += x3 * t3;                        \
                                           \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);    \
    tp1 += x0 * t4;                        \
    tp1 += x1 * t5;                        \
    tp1 += x2 * t6;                        \
    tp1 += x3 * t7;                        \
} while (0)
|
||||
|
||||
/* Two pointers, two x vectors: for pa0 and pa1 two vectors read at offset k
 * are multiplied by x0 and x1 and added, in that order, to tp0/tp1. */
#define DGEMV_T_2x4()               \
do {                                \
    LD_DP2(pa0 + k, 2, t0, t1);     \
    tp0 += x0 * t0;                 \
    tp0 += x1 * t1;                 \
                                    \
    LD_DP2(pa1 + k, 2, t4, t5);     \
    tp1 += x0 * t4;                 \
    tp1 += x1 * t5;                 \
} while (0)
|
||||
|
||||
/* Two pointers, one x vector: one vector read at offset k from pa0 and pa1
 * is multiplied by x0 and added to tp0 and tp1 respectively. */
#define DGEMV_T_2x2()           \
do {                            \
    t0 = LD_DP(pa0 + k);        \
    tp0 += x0 * t0;             \
                                \
    t4 = LD_DP(pa1 + k);        \
    tp1 += x0 * t4;             \
} while (0)
|
||||
|
||||
/* Loaders for the x operand of the transposed DGEMV kernel.
 *
 * The *_GP variants gather strided elements (stride inc_x) two at a time
 * into a v2f64 by inserting their 64-bit patterns lane by lane; the
 * *_VECTOR variants read x with plain vector loads.  None of them advance
 * x.
 *
 * NOTE(review): tp0 is only used as the starting vector for the first
 * insert; both of its lanes are overwritten, so its value does not reach
 * the result.  The double elements are read through a long long lvalue,
 * which looks like it relies on the compiler tolerating that type pun. */

/* Private helper: vec = { x[idx * inc_x], x[(idx + 1) * inc_x] }. */
#define DGEMV_T_LOAD_X_PAIR(vec, idx)                                                        \
    vec = (v2f64) __msa_insert_d((v2i64) tp0, 0, *((long long *)(x + (idx) * inc_x)));       \
    vec = (v2f64) __msa_insert_d((v2i64) vec, 1, *((long long *)(x + ((idx) + 1) * inc_x)));

/* Gather eight strided x elements into x0..x3. */
#define DLOAD_X8_GP()                 \
    DGEMV_T_LOAD_X_PAIR(x0, 0)        \
    DGEMV_T_LOAD_X_PAIR(x1, 2)        \
    DGEMV_T_LOAD_X_PAIR(x2, 4)        \
    DGEMV_T_LOAD_X_PAIR(x3, 6)

/* Gather four strided x elements into x0 and x1. */
#define DLOAD_X4_GP()                 \
    DGEMV_T_LOAD_X_PAIR(x0, 0)        \
    DGEMV_T_LOAD_X_PAIR(x1, 2)

/* Gather two strided x elements into x0. */
#define DLOAD_X2_GP()                 \
    DGEMV_T_LOAD_X_PAIR(x0, 0)

/* Vector-load counterparts of the gathers above. */
#define DLOAD_X8_VECTOR()  LD_DP4(x, 2, x0, x1, x2, x3);
#define DLOAD_X4_VECTOR()  LD_DP2(x, 2, x0, x1);
#define DLOAD_X2_VECTOR()  x0 = LD_DP(x);
|
||||
|
||||
/* Body of the transposed DGEMV kernel.
 *
 * Walks the pa<c> pointers in groups of 8, then 4, 2 and 1 (selected by the
 * bits of n).  For each group it restarts x at srcx_org, accumulates
 * sum(pa<c>[k] * x[k * inc_x]) over m elements (vector steps of 8/4/2 plus a
 * scalar step when m is odd), and then performs
 *     y[c * inc_y] += alpha * sum
 * before advancing y and the pa pointers past the group.
 *
 * DLOAD_X8 / DLOAD_X4 / DLOAD_X2 must be defined by the caller to select how
 * x is loaded.  Expects i, j, k, m, n, lda, alpha, x, srcx_org, inc_x, y,
 * inc_y, zero, the pa, t, tp, temp and res variables in the enclosing scope.
 * All y values of a group are read before any of them is written back. */
#define DGEMV_T_MSA()                                   \
    j = (n >> 3);                                       \
    while (j--)                                         \
    {                                                   \
        tp0 = tp1 = tp2 = tp3 = zero;                   \
        tp4 = tp5 = tp6 = tp7 = zero;                   \
                                                        \
        x = srcx_org;                                   \
        k = 0;                                          \
                                                        \
        i = (m >> 3);                                   \
        while (i--)                                     \
        {                                               \
            DLOAD_X8();                                 \
            DGEMV_T_8x8();                              \
                                                        \
            k += 8;                                     \
            x += 8 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 4)                                      \
        {                                               \
            DLOAD_X4();                                 \
            DGEMV_T_8x4();                              \
                                                        \
            k += 4;                                     \
            x += 4 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 2)                                      \
        {                                               \
            DLOAD_X2();                                 \
            DGEMV_T_8x2();                              \
                                                        \
            k += 2;                                     \
            x += 2 * inc_x;                             \
        }                                               \
                                                        \
        /* fold the two lanes of each accumulator */    \
        ILVRL_D2_DP(tp1, tp0, t0, t4);                  \
        ILVRL_D2_DP(tp3, tp2, t1, t5);                  \
        ILVRL_D2_DP(tp5, tp4, t2, t6);                  \
        ILVRL_D2_DP(tp7, tp6, t3, t7);                  \
        ADD2(t0, t4, t1, t5, t0, t1);                   \
        ADD2(t2, t6, t3, t7, t2, t3);                   \
                                                        \
        temp0 = t0[0];                                  \
        temp1 = t0[1];                                  \
        temp2 = t1[0];                                  \
        temp3 = t1[1];                                  \
        temp4 = t2[0];                                  \
        temp5 = t2[1];                                  \
        temp6 = t3[0];                                  \
        temp7 = t3[1];                                  \
                                                        \
        if (m & 1)                                      \
        {                                               \
            temp0 += pa0[k] * x[0];                     \
            temp1 += pa1[k] * x[0];                     \
            temp2 += pa2[k] * x[0];                     \
            temp3 += pa3[k] * x[0];                     \
            temp4 += pa4[k] * x[0];                     \
            temp5 += pa5[k] * x[0];                     \
            temp6 += pa6[k] * x[0];                     \
            temp7 += pa7[k] * x[0];                     \
                                                        \
            k++;                                        \
            x += inc_x;                                 \
        }                                               \
                                                        \
        /* read and update every y first ... */         \
        res0 = y[0 * inc_y] + alpha * temp0;            \
        res1 = y[1 * inc_y] + alpha * temp1;            \
        res2 = y[2 * inc_y] + alpha * temp2;            \
        res3 = y[3 * inc_y] + alpha * temp3;            \
        res4 = y[4 * inc_y] + alpha * temp4;            \
        res5 = y[5 * inc_y] + alpha * temp5;            \
        res6 = y[6 * inc_y] + alpha * temp6;            \
        res7 = y[7 * inc_y] + alpha * temp7;            \
                                                        \
        /* ... then write them all back */              \
        y[0 * inc_y] = res0;                            \
        y[1 * inc_y] = res1;                            \
        y[2 * inc_y] = res2;                            \
        y[3 * inc_y] = res3;                            \
        y[4 * inc_y] = res4;                            \
        y[5 * inc_y] = res5;                            \
        y[6 * inc_y] = res6;                            \
        y[7 * inc_y] = res7;                            \
                                                        \
        y += 8 * inc_y;                                 \
                                                        \
        pa0 += 8 * lda;                                 \
        pa1 += 8 * lda;                                 \
        pa2 += 8 * lda;                                 \
        pa3 += 8 * lda;                                 \
        pa4 += 8 * lda;                                 \
        pa5 += 8 * lda;                                 \
        pa6 += 8 * lda;                                 \
        pa7 += 8 * lda;                                 \
    }                                                   \
                                                        \
    if (n & 4)                                          \
    {                                                   \
        tp0 = tp1 = tp2 = tp3 = zero;                   \
                                                        \
        x = srcx_org;                                   \
        k = 0;                                          \
                                                        \
        i = (m >> 3);                                   \
        while (i--)                                     \
        {                                               \
            DLOAD_X8();                                 \
            DGEMV_T_4x8();                              \
                                                        \
            k += 8;                                     \
            x += 8 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 4)                                      \
        {                                               \
            DLOAD_X4();                                 \
            DGEMV_T_4x4();                              \
                                                        \
            k += 4;                                     \
            x += 4 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 2)                                      \
        {                                               \
            DLOAD_X2();                                 \
            DGEMV_T_4x2();                              \
                                                        \
            k += 2;                                     \
            x += 2 * inc_x;                             \
        }                                               \
                                                        \
        ILVRL_D2_DP(tp1, tp0, t0, t4);                  \
        ILVRL_D2_DP(tp3, tp2, t1, t5);                  \
        ADD2(t0, t4, t1, t5, t0, t1);                   \
                                                        \
        temp0 = t0[0];                                  \
        temp1 = t0[1];                                  \
        temp2 = t1[0];                                  \
        temp3 = t1[1];                                  \
                                                        \
        if (m & 1)                                      \
        {                                               \
            temp0 += pa0[k] * x[0];                     \
            temp1 += pa1[k] * x[0];                     \
            temp2 += pa2[k] * x[0];                     \
            temp3 += pa3[k] * x[0];                     \
                                                        \
            k++;                                        \
            x += inc_x;                                 \
        }                                               \
                                                        \
        res0 = y[0 * inc_y] + alpha * temp0;            \
        res1 = y[1 * inc_y] + alpha * temp1;            \
        res2 = y[2 * inc_y] + alpha * temp2;            \
        res3 = y[3 * inc_y] + alpha * temp3;            \
                                                        \
        y[0 * inc_y] = res0;                            \
        y[1 * inc_y] = res1;                            \
        y[2 * inc_y] = res2;                            \
        y[3 * inc_y] = res3;                            \
                                                        \
        y += 4 * inc_y;                                 \
                                                        \
        pa0 += 4 * lda;                                 \
        pa1 += 4 * lda;                                 \
        pa2 += 4 * lda;                                 \
        pa3 += 4 * lda;                                 \
    }                                                   \
                                                        \
    if (n & 2)                                          \
    {                                                   \
        tp0 = tp1 = zero;                               \
                                                        \
        x = srcx_org;                                   \
        k = 0;                                          \
                                                        \
        i = (m >> 3);                                   \
        while (i--)                                     \
        {                                               \
            DLOAD_X8();                                 \
            DGEMV_T_2x8();                              \
                                                        \
            k += 8;                                     \
            x += 8 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 4)                                      \
        {                                               \
            DLOAD_X4();                                 \
            DGEMV_T_2x4();                              \
                                                        \
            k += 4;                                     \
            x += 4 * inc_x;                             \
        }                                               \
                                                        \
        if (m & 2)                                      \
        {                                               \
            DLOAD_X2();                                 \
            DGEMV_T_2x2();                              \
                                                        \
            k += 2;                                     \
            x += 2 * inc_x;                             \
        }                                               \
                                                        \
        ILVRL_D2_DP(tp1, tp0, t0, t4);                  \
                                                        \
        t0 += t4;                                       \
                                                        \
        temp0 = t0[0];                                  \
        temp1 = t0[1];                                  \
                                                        \
        if (m & 1)                                      \
        {                                               \
            temp0 += pa0[k] * x[0];                     \
            temp1 += pa1[k] * x[0];                     \
                                                        \
            k++;                                        \
            x += inc_x;                                 \
        }                                               \
                                                        \
        res0 = y[0 * inc_y] + alpha * temp0;            \
        res1 = y[1 * inc_y] + alpha * temp1;            \
                                                        \
        y[0 * inc_y] = res0;                            \
        y[1 * inc_y] = res1;                            \
                                                        \
        y += 2 * inc_y;                                 \
                                                        \
        pa0 += 2 * lda;                                 \
        pa1 += 2 * lda;                                 \
    }                                                   \
                                                        \
    if (n & 1)                                          \
    {                                                   \
        /* last pointer: plain scalar loop over m */    \
        temp0 = 0.0;                                    \
                                                        \
        x = srcx_org;                                   \
        k = 0;                                          \
                                                        \
        i = m;                                          \
        while (i--)                                     \
        {                                               \
            temp0 += pa0[k] * x[0];                     \
            k++;                                        \
            x += inc_x;                                 \
        }                                               \
                                                        \
        y[0] += alpha * temp0;                          \
        y += inc_y;                                     \
        pa0 += lda;                                     \
    }
|
||||
|
||||
|
||||
/* Transposed DGEMV kernel entry point.
 *
 * For each of the n pointers A + c * lda, adds
 *     alpha * sum_{k < m} A[c * lda + k] * x[k * inc_x]
 * to y[c * inc_y].  The work is done by DGEMV_T_MSA(), instantiated once
 * with gathering x loaders (inc_x != 1) and once with vector x loaders
 * (inc_x == 1).
 *
 * dummy1 and buffer are not used by this kernel.  Always returns 0. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
          BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
          FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *srcx_org = x;
    FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
    FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
    FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
    v2f64 x0, x1, x2, x3;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 t16, t17, t18, t19, t20, t21, t22, t23, t24, t25, t26, t27, t28, t29;
    v2f64 t30, t31, tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v2f64 zero = {0};

    /* eight consecutive pointers into A, lda elements apart */
    pa0 = A;
    pa1 = pa0 + lda;
    pa2 = pa1 + lda;
    pa3 = pa2 + lda;
    pa4 = pa3 + lda;
    pa5 = pa4 + lda;
    pa6 = pa5 + lda;
    pa7 = pa6 + lda;

    if (1 != inc_x)
    {
        /* strided x: gather elements one by one */
        #define DLOAD_X8  DLOAD_X8_GP
        #define DLOAD_X4  DLOAD_X4_GP
        #define DLOAD_X2  DLOAD_X2_GP

        DGEMV_T_MSA();

        #undef DLOAD_X8
        #undef DLOAD_X4
        #undef DLOAD_X2
    }
    else
    {
        /* contiguous x: plain vector loads */
        #define DLOAD_X8  DLOAD_X8_VECTOR
        #define DLOAD_X4  DLOAD_X4_VECTOR
        #define DLOAD_X2  DLOAD_X2_VECTOR

        DGEMV_T_MSA();

        #undef DLOAD_X8
        #undef DLOAD_X4
        #undef DLOAD_X2
    }

    return(0);
}
|
|
@ -0,0 +1,333 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
#define AND_VEC_W(in) ((v4f32) ((v4i32) in & and_vec))
|
||||
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT data0, data1, data2, sumf = 0.0;
|
||||
v4f32 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v4f32 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
|
||||
v4f32 zero_v = {0};
|
||||
v4i32 and_vec = {0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return (sumf);
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
if (n > 31)
|
||||
{
|
||||
n -= 32;
|
||||
|
||||
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 = AND_VEC_W(src0);
|
||||
sum_abs1 = AND_VEC_W(src1);
|
||||
sum_abs2 = AND_VEC_W(src2);
|
||||
sum_abs3 = AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = zero_v;
|
||||
sum_abs1 = zero_v;
|
||||
sum_abs2 = zero_v;
|
||||
sum_abs3 = zero_v;
|
||||
}
|
||||
|
||||
for (i = 0; i < (n >> 5); i++)
|
||||
{
|
||||
LD_SP8_INC(x, 4, src0, src1, src2, src3, src4, src5, src6, src7);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
sum_abs3 += AND_VEC_W(src7);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if ((n & 16) && (n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP7_INC(x, 4, src0, src1, src2, src3, src4, src5, src6);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
sum_abs2 += AND_VEC_W(src6);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 16) && (n & 8))
|
||||
{
|
||||
LD_SP6_INC(x, 4, src0, src1, src2, src3, src4, src5);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
sum_abs1 += AND_VEC_W(src5);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 16) && (n & 4))
|
||||
{
|
||||
LD_SP5_INC(x, 4, src0, src1, src2, src3, src4);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
sum_abs0 += AND_VEC_W(src4);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP3_INC(x, 4, src0, src1, src2);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(x, 4, src0, src1, src2, src3);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
sum_abs2 += AND_VEC_W(src2);
|
||||
sum_abs3 += AND_VEC_W(src3);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(x, 4, src0, src1);
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src1);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
src0 = LD_SP(x); x += 4;
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
|
||||
if (n & 2)
|
||||
{
|
||||
sumf += fabsf(*(x + 0));
|
||||
sumf += fabsf(*(x + 1));
|
||||
x += 2;
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
sumf += fabsf(*(x + 0));
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = sum_abs0 + sum_abs1 + sum_abs2 + sum_abs3;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
}
|
||||
}
|
||||
else
|
||||
{
|
||||
if (n > 8)
|
||||
{
|
||||
n -= 8;
|
||||
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
sum_abs0 = AND_VEC_W(src0);
|
||||
sum_abs1 = AND_VEC_W(src4);
|
||||
}
|
||||
else
|
||||
{
|
||||
sum_abs0 = zero_v;
|
||||
sum_abs1 = zero_v;
|
||||
}
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
src4 = (v4f32) __msa_insert_w((v4i32) src4, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
sum_abs1 += AND_VEC_W(src4);
|
||||
}
|
||||
|
||||
if (n & 4)
|
||||
{
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) zero_v, 0, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 1, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 2, *((int *) x));
|
||||
x += inc_x;
|
||||
src0 = (v4f32) __msa_insert_w((v4i32) src0, 3, *((int *) x));
|
||||
x += inc_x;
|
||||
|
||||
sum_abs0 += AND_VEC_W(src0);
|
||||
}
|
||||
|
||||
sum_abs0 += sum_abs1;
|
||||
|
||||
sumf += sum_abs0[0];
|
||||
sumf += sum_abs0[1];
|
||||
sumf += sum_abs0[2];
|
||||
sumf += sum_abs0[3];
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
data0 = fabsf(*x); x += inc_x;
|
||||
data1 = fabsf(*x); x += inc_x;
|
||||
data2 = fabsf(*x);
|
||||
|
||||
sumf += data0;
|
||||
sumf += data1;
|
||||
sumf += data2;
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
data0 = fabsf(*x); x += inc_x;
|
||||
data1 = fabsf(*x);
|
||||
|
||||
sumf += data0;
|
||||
sumf += data1;
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
data0 = fabsf(*x);
|
||||
|
||||
sumf += data0;
|
||||
}
|
||||
}
|
||||
|
||||
return (sumf);
|
||||
}
|
|
@ -0,0 +1,208 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* return float, x,y float */
|
||||
#if defined(DSDOT)
|
||||
double CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
double dot = 0.0;
|
||||
float x0, x1, x2, x3, y0, y1, y2, y3;
|
||||
v4f32 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v4f32 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v4f32 dot0 = {0, 0, 0, 0};
|
||||
|
||||
if (n < 0) return (dot);
|
||||
|
||||
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
for (i = (n >> 5); i--;)
|
||||
{
|
||||
LD_SP8_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_SP8_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
dot0 += (vy7 * vx7);
|
||||
}
|
||||
|
||||
if (n & 31)
|
||||
{
|
||||
if ((n & 16) && (n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP7_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5, vx6);
|
||||
LD_SP7_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5, vy6);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
dot0 += (vy6 * vx6);
|
||||
}
|
||||
else if ((n & 16) && (n & 8))
|
||||
{
|
||||
LD_SP6_INC(x, 4, vx0, vx1, vx2, vx3, vx4, vx5);
|
||||
LD_SP6_INC(y, 4, vy0, vy1, vy2, vy3, vy4, vy5);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
dot0 += (vy5 * vx5);
|
||||
}
|
||||
else if ((n & 16) && (n & 4))
|
||||
{
|
||||
LD_SP5_INC(x, 4, vx0, vx1, vx2, vx3, vx4);
|
||||
LD_SP5_INC(y, 4, vy0, vy1, vy2, vy3, vy4);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
dot0 += (vy4 * vx4);
|
||||
}
|
||||
else if ((n & 8) && (n & 4))
|
||||
{
|
||||
LD_SP3_INC(x, 4, vx0, vx1, vx2);
|
||||
LD_SP3_INC(y, 4, vy0, vy1, vy2);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
}
|
||||
else if (n & 16)
|
||||
{
|
||||
LD_SP4_INC(x, 4, vx0, vx1, vx2, vx3);
|
||||
LD_SP4_INC(y, 4, vy0, vy1, vy2, vy3);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
dot0 += (vy2 * vx2);
|
||||
dot0 += (vy3 * vx3);
|
||||
}
|
||||
else if (n & 8)
|
||||
{
|
||||
LD_SP2_INC(x, 4, vx0, vx1);
|
||||
LD_SP2_INC(y, 4, vy0, vy1);
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
dot0 += (vy1 * vx1);
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
vx0 = LD_SP(x); x += 4;
|
||||
vy0 = LD_SP(y); y += 4;
|
||||
|
||||
dot0 += (vy0 * vx0);
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(x, 1, x0, x1, x2);
|
||||
LD_GP3_INC(y, 1, y0, y1, y2);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, 1, x0, x1);
|
||||
LD_GP2_INC(y, 1, y0, y1);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
dot += dot0[0];
|
||||
dot += dot0[1];
|
||||
dot += dot0[2];
|
||||
dot += dot0[3];
|
||||
}
|
||||
else
|
||||
{
|
||||
for (i = (n >> 2); i--;)
|
||||
{
|
||||
LD_GP4_INC(x, inc_x, x0, x1, x2, x3);
|
||||
LD_GP4_INC(y, inc_y, y0, y1, y2, y3);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
dot += (y3 * x3);
|
||||
}
|
||||
|
||||
if ((n & 2) && (n & 1))
|
||||
{
|
||||
LD_GP3_INC(x, inc_x, x0, x1, x2);
|
||||
LD_GP3_INC(y, inc_y, y0, y1, y2);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
dot += (y2 * x2);
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_GP2_INC(x, inc_x, x0, x1);
|
||||
LD_GP2_INC(y, inc_y, y0, y1);
|
||||
|
||||
dot += (y0 * x0);
|
||||
dot += (y1 * x1);
|
||||
}
|
||||
else if (n & 1)
|
||||
{
|
||||
x0 = *x;
|
||||
y0 = *y;
|
||||
|
||||
dot += (y0 * x0);
|
||||
}
|
||||
}
|
||||
|
||||
return (dot);
|
||||
}
|
|
@ -0,0 +1,515 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* 8-row x 8-column tile of the y += A * (alpha * x) update.
 * Loads two vectors (t<2j>, t<2j+1>) from each column pointer pa0..pa7 at
 * row offset k and accumulates tp<j> * t<2j> into y0 and tp<j> * t<2j+1>
 * into y1.  Uses pa0..pa7, k, t0..t15, tp0..tp7, y0 and y1 from the
 * enclosing scope.  LD_SP2 comes from macros_msa.h; presumably it loads two
 * v4f32 vectors 4 floats apart - confirm there. */
#define SGEMV_N_8x8() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
LD_SP2(pa2 + k, 4, t4, t5); \
|
||||
LD_SP2(pa3 + k, 4, t6, t7); \
|
||||
LD_SP2(pa4 + k, 4, t8, t9); \
|
||||
LD_SP2(pa5 + k, 4, t10, t11); \
|
||||
LD_SP2(pa6 + k, 4, t12, t13); \
|
||||
LD_SP2(pa7 + k, 4, t14, t15); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y1 += tp0 * t1; \
|
||||
\
|
||||
y0 += tp1 * t2; \
|
||||
y1 += tp1 * t3; \
|
||||
\
|
||||
y0 += tp2 * t4; \
|
||||
y1 += tp2 * t5; \
|
||||
\
|
||||
y0 += tp3 * t6; \
|
||||
y1 += tp3 * t7; \
|
||||
\
|
||||
y0 += tp4 * t8; \
|
||||
y1 += tp4 * t9; \
|
||||
\
|
||||
y0 += tp5 * t10; \
|
||||
y1 += tp5 * t11; \
|
||||
\
|
||||
y0 += tp6 * t12; \
|
||||
y1 += tp6 * t13; \
|
||||
\
|
||||
y0 += tp7 * t14; \
|
||||
y1 += tp7 * t15; \
|
||||
}
|
||||
|
||||
/* 4-row x 8-column tile: loads one vector from each of pa0..pa7 at row
 * offset k (into the even-numbered t registers) and accumulates
 * tp<j> * column<j> into y0 only. */
#define SGEMV_N_4x8() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
t4 = LD_SP(pa2 + k); \
|
||||
t6 = LD_SP(pa3 + k); \
|
||||
t8 = LD_SP(pa4 + k); \
|
||||
t10 = LD_SP(pa5 + k); \
|
||||
t12 = LD_SP(pa6 + k); \
|
||||
t14 = LD_SP(pa7 + k); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y0 += tp1 * t2; \
|
||||
y0 += tp2 * t4; \
|
||||
y0 += tp3 * t6; \
|
||||
y0 += tp4 * t8; \
|
||||
y0 += tp5 * t10; \
|
||||
y0 += tp6 * t12; \
|
||||
y0 += tp7 * t14; \
|
||||
}
|
||||
|
||||
/* 8-row x 4-column tile: loads two vectors from each of pa0..pa3 at row
 * offset k and accumulates tp0..tp3 times those columns into y0 (first
 * vector of each pair) and y1 (second vector of each pair). */
#define SGEMV_N_8x4() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
LD_SP2(pa2 + k, 4, t4, t5); \
|
||||
LD_SP2(pa3 + k, 4, t6, t7); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y1 += tp0 * t1; \
|
||||
\
|
||||
y0 += tp1 * t2; \
|
||||
y1 += tp1 * t3; \
|
||||
\
|
||||
y0 += tp2 * t4; \
|
||||
y1 += tp2 * t5; \
|
||||
\
|
||||
y0 += tp3 * t6; \
|
||||
y1 += tp3 * t7; \
|
||||
}
|
||||
|
||||
/* 4-row x 4-column tile: loads one vector from each of pa0..pa3 at row
 * offset k and accumulates tp0..tp3 times those columns into y0. */
#define SGEMV_N_4x4() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
t4 = LD_SP(pa2 + k); \
|
||||
t6 = LD_SP(pa3 + k); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y0 += tp1 * t2; \
|
||||
y0 += tp2 * t4; \
|
||||
y0 += tp3 * t6; \
|
||||
}
|
||||
|
||||
/* 8-row x 2-column tile: loads two vectors from each of pa0 and pa1 at row
 * offset k and accumulates tp0/tp1 times those columns into y0 and y1. */
#define SGEMV_N_8x2() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y1 += tp0 * t1; \
|
||||
\
|
||||
y0 += tp1 * t2; \
|
||||
y1 += tp1 * t3; \
|
||||
}
|
||||
|
||||
/* 4-row x 2-column tile: loads one vector from each of pa0 and pa1 at row
 * offset k and accumulates tp0/tp1 times those columns into y0. */
#define SGEMV_N_4x2() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
\
|
||||
y0 += tp0 * t0; \
|
||||
y0 += tp1 * t2; \
|
||||
}
|
||||
|
||||
/* Strided ("GP", scalar-load) variant of the 8-element x loader.
 * Reads x[0], x[inc_x], ..., x[7 * inc_x], multiplies each by alpha into
 * temp0..temp7, then converts every scalar to a vector tp0..tp7 with
 * COPY_FLOAT_TO_VECTOR (defined outside this file; presumably a broadcast
 * of the scalar to all lanes - confirm in macros_msa.h).
 * Not wrapped in braces: expands to a plain statement sequence. */
#define SLOAD_X8_SCALE_GP() \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
temp2 = alpha * x[2 * inc_x]; \
|
||||
temp3 = alpha * x[3 * inc_x]; \
|
||||
temp4 = alpha * x[4 * inc_x]; \
|
||||
temp5 = alpha * x[5 * inc_x]; \
|
||||
temp6 = alpha * x[6 * inc_x]; \
|
||||
temp7 = alpha * x[7 * inc_x]; \
|
||||
\
|
||||
tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
|
||||
tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
|
||||
tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
|
||||
tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
|
||||
tp4 = COPY_FLOAT_TO_VECTOR(temp4); \
|
||||
tp5 = COPY_FLOAT_TO_VECTOR(temp5); \
|
||||
tp6 = COPY_FLOAT_TO_VECTOR(temp6); \
|
||||
tp7 = COPY_FLOAT_TO_VECTOR(temp7); \
|
||||
|
||||
/* Strided (scalar-load) variant of the 4-element x loader: computes
 * temp0..temp3 = alpha * x[j * inc_x] and converts each to a vector
 * tp0..tp3 with COPY_FLOAT_TO_VECTOR. */
#define SLOAD_X4_SCALE_GP() \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
temp2 = alpha * x[2 * inc_x]; \
|
||||
temp3 = alpha * x[3 * inc_x]; \
|
||||
\
|
||||
tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
|
||||
tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
|
||||
tp2 = COPY_FLOAT_TO_VECTOR(temp2); \
|
||||
tp3 = COPY_FLOAT_TO_VECTOR(temp3); \
|
||||
|
||||
/* Vector-load variant of the 8-element x loader (selected by the caller
 * when inc_x == 1).  Loads x0 and x1 from x, multiplies both by v_alpha,
 * then fills tp0..tp3 from x0 and tp4..tp7 from x1 via SPLATI_W4_SP
 * (defined in macros_msa.h; presumably splats each lane into its own
 * vector - confirm).  Unlike the _GP variant it does not set
 * temp0..temp7. */
#define SLOAD_X8_SCALE_VECTOR() \
|
||||
LD_SP2(x, 4, x0, x1); \
|
||||
\
|
||||
x0 = x0 * v_alpha; \
|
||||
x1 = x1 * v_alpha; \
|
||||
\
|
||||
SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
|
||||
SPLATI_W4_SP(x1, tp4, tp5, tp6, tp7); \
|
||||
|
||||
/* Vector-load variant of the 4-element x loader: loads x0 from x, scales
 * it by v_alpha and fills tp0..tp3 from it via SPLATI_W4_SP.  Does not
 * set temp0..temp3. */
#define SLOAD_X4_SCALE_VECTOR() \
|
||||
x0 = LD_SP(x); \
|
||||
x0 = x0 * v_alpha; \
|
||||
SPLATI_W4_SP(x0, tp0, tp1, tp2, tp3); \
|
||||
|
||||
/* Strided gather of 8 y elements into y0 (elements 0..3) and y1 (4..7).
 * Each element at y + j * inc_y is read through an int pointer and placed
 * into one lane with __msa_insert_w.  tp0 only serves as the starting
 * vector value; all four lanes of each result are overwritten.
 * NOTE(review): the float storage is read via (int *), i.e. type punning
 * through a pointer cast - confirm the build tolerates this. */
#define SLOAD_Y8_GP() \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
|
||||
y1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 4 * inc_y))); \
|
||||
y1 = (v4f32) __msa_insert_w((v4i32) y1, 1, *((int *)(y + 5 * inc_y))); \
|
||||
y1 = (v4f32) __msa_insert_w((v4i32) y1, 2, *((int *)(y + 6 * inc_y))); \
|
||||
y1 = (v4f32) __msa_insert_w((v4i32) y1, 3, *((int *)(y + 7 * inc_y))); \
|
||||
|
||||
/* Strided gather of 4 y elements (y + j * inc_y, j = 0..3) into the lanes
 * of y0 via __msa_insert_w; tp0 is only the starting vector value and
 * every lane is overwritten.  Elements are read through an int pointer. */
#define SLOAD_Y4_GP() \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(y + 0 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 1, *((int *)(y + 1 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 2, *((int *)(y + 2 * inc_y))); \
|
||||
y0 = (v4f32) __msa_insert_w((v4i32) y0, 3, *((int *)(y + 3 * inc_y))); \
|
||||
|
||||
/* Vector-load variants of the y loaders (selected by the caller when
 * inc_y == 1): load y0/y1, or just y0, directly from y. */
#define SLOAD_Y8_VECTOR() LD_SP2(y, 4, y0, y1);
|
||||
#define SLOAD_Y4_VECTOR() y0 = LD_SP(y);
|
||||
|
||||
/* Strided scatter of 8 results: extracts each lane of y0 (elements 0..3)
 * and y1 (elements 4..7) with __msa_copy_s_w and writes it to
 * y + j * inc_y through an int pointer (mirror of SLOAD_Y8_GP). */
#define SSTORE_Y8_GP() \
|
||||
*((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
|
||||
*((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
|
||||
*((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
|
||||
*((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
|
||||
*((int *)(y + 4 * inc_y)) = __msa_copy_s_w((v4i32) y1, 0); \
|
||||
*((int *)(y + 5 * inc_y)) = __msa_copy_s_w((v4i32) y1, 1); \
|
||||
*((int *)(y + 6 * inc_y)) = __msa_copy_s_w((v4i32) y1, 2); \
|
||||
*((int *)(y + 7 * inc_y)) = __msa_copy_s_w((v4i32) y1, 3); \
|
||||
|
||||
/* Strided scatter of 4 results: writes the four lanes of y0 to
 * y + j * inc_y (j = 0..3) through an int pointer. */
#define SSTORE_Y4_GP() \
|
||||
*((int *)(y + 0 * inc_y)) = __msa_copy_s_w((v4i32) y0, 0); \
|
||||
*((int *)(y + 1 * inc_y)) = __msa_copy_s_w((v4i32) y0, 1); \
|
||||
*((int *)(y + 2 * inc_y)) = __msa_copy_s_w((v4i32) y0, 2); \
|
||||
*((int *)(y + 3 * inc_y)) = __msa_copy_s_w((v4i32) y0, 3); \
|
||||
|
||||
/* Vector-store variants of the y writers (selected by the caller when
 * inc_y == 1): store y0/y1, or just y0, directly to y. */
#define SSTORE_Y8_VECTOR() ST_SP2(y0, y1, y, 4);
|
||||
#define SSTORE_Y4_VECTOR() ST_SP(y0, y);
|
||||
|
||||
/* Body of the non-transposed kernel: y += A * (alpha * x).
 *
 * Must be expanded where SLOAD_X8_SCALE, SLOAD_X4_SCALE, SLOAD_Y8, SLOAD_Y4,
 * SSTORE_Y8 and SSTORE_Y4 are #defined to the _VECTOR or _GP variants, and
 * where m, n, alpha, lda, x, inc_x, y, y_org, inc_y, i, j, k, pa0..pa7,
 * temp, temp0..temp7, tp0..tp7, y0 and y1 are in scope.
 *
 * Columns are consumed in groups of 8 (n >> 3 times), then 4, 2 and 1.  For
 * every group y is rewound to y_org and the m rows are walked in chunks of
 * 8, then 4, then a scalar loop for the last (m & 3) rows.  After a group
 * the column pointers and x are advanced past the consumed columns.
 *
 * The scalar row tails recompute temp<j> = alpha * x[j * inc_x] because the
 * _VECTOR x loaders do not set temp0..temp7. */
#define SGEMV_N_MSA() \
|
||||
for (j = (n >> 3); j--;) \
|
||||
{ \
|
||||
SLOAD_X8_SCALE(); \
|
||||
\
|
||||
k = 0; \
|
||||
y = y_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_Y8(); \
|
||||
SGEMV_N_8x8(); \
|
||||
SSTORE_Y8(); \
|
||||
\
|
||||
y += 8 * inc_y; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_Y4(); \
|
||||
SGEMV_N_4x8(); \
|
||||
SSTORE_Y4(); \
|
||||
\
|
||||
y += 4 * inc_y; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
if (m & 3) \
|
||||
{ \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
temp2 = alpha * x[2 * inc_x]; \
|
||||
temp3 = alpha * x[3 * inc_x]; \
|
||||
temp4 = alpha * x[4 * inc_x]; \
|
||||
temp5 = alpha * x[5 * inc_x]; \
|
||||
temp6 = alpha * x[6 * inc_x]; \
|
||||
temp7 = alpha * x[7 * inc_x]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp = y[0]; \
|
||||
temp += temp0 * pa0[k]; \
|
||||
temp += temp1 * pa1[k]; \
|
||||
temp += temp2 * pa2[k]; \
|
||||
temp += temp3 * pa3[k]; \
|
||||
temp += temp4 * pa4[k]; \
|
||||
temp += temp5 * pa5[k]; \
|
||||
temp += temp6 * pa6[k]; \
|
||||
temp += temp7 * pa7[k]; \
|
||||
y[0] = temp; \
|
||||
\
|
||||
y += inc_y; \
|
||||
k++; \
|
||||
} \
|
||||
} \
|
||||
pa0 += 8 * lda; \
|
||||
pa1 += 8 * lda; \
|
||||
pa2 += 8 * lda; \
|
||||
pa3 += 8 * lda; \
|
||||
pa4 += 8 * lda; \
|
||||
pa5 += 8 * lda; \
|
||||
pa6 += 8 * lda; \
|
||||
pa7 += 8 * lda; \
|
||||
\
|
||||
x += 8 * inc_x; \
|
||||
} \
|
||||
\
|
||||
if (n & 4) \
|
||||
{ \
|
||||
SLOAD_X4_SCALE(); \
|
||||
\
|
||||
k = 0; \
|
||||
y = y_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_Y8(); \
|
||||
SGEMV_N_8x4(); \
|
||||
SSTORE_Y8(); \
|
||||
\
|
||||
y += 8 * inc_y; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_Y4(); \
|
||||
SGEMV_N_4x4(); \
|
||||
SSTORE_Y4(); \
|
||||
\
|
||||
y += 4 * inc_y; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
if (m & 3) \
|
||||
{ \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
temp2 = alpha * x[2 * inc_x]; \
|
||||
temp3 = alpha * x[3 * inc_x]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp = y[0]; \
|
||||
temp += temp0 * pa0[k]; \
|
||||
temp += temp1 * pa1[k]; \
|
||||
temp += temp2 * pa2[k]; \
|
||||
temp += temp3 * pa3[k]; \
|
||||
y[0] = temp; \
|
||||
\
|
||||
y += inc_y; \
|
||||
k++; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
pa0 += 4 * lda; \
|
||||
pa1 += 4 * lda; \
|
||||
pa2 += 4 * lda; \
|
||||
pa3 += 4 * lda; \
|
||||
\
|
||||
x += 4 * inc_x; \
|
||||
} \
|
||||
\
|
||||
if (n & 2) \
|
||||
{ \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
\
|
||||
tp0 = COPY_FLOAT_TO_VECTOR(temp0); \
|
||||
tp1 = COPY_FLOAT_TO_VECTOR(temp1); \
|
||||
\
|
||||
k = 0; \
|
||||
y = y_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_Y8(); \
|
||||
SGEMV_N_8x2(); \
|
||||
SSTORE_Y8(); \
|
||||
\
|
||||
y += 8 * inc_y; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_Y4(); \
|
||||
SGEMV_N_4x2(); \
|
||||
SSTORE_Y4(); \
|
||||
\
|
||||
y += 4 * inc_y; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
if (m & 3) \
|
||||
{ \
|
||||
temp0 = alpha * x[0 * inc_x]; \
|
||||
temp1 = alpha * x[1 * inc_x]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp = y[0]; \
|
||||
temp += temp0 * pa0[k]; \
|
||||
temp += temp1 * pa1[k]; \
|
||||
y[0] = temp; \
|
||||
\
|
||||
y += inc_y; \
|
||||
k++; \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
pa0 += 2 * lda; \
|
||||
pa1 += 2 * lda; \
|
||||
\
|
||||
x += 2 * inc_x; \
|
||||
} \
|
||||
\
|
||||
if (n & 1) \
|
||||
{ \
|
||||
temp = alpha * x[0]; \
|
||||
\
|
||||
k = 0; \
|
||||
y = y_org; \
|
||||
\
|
||||
for (i = m; i--;) \
|
||||
{ \
|
||||
y[0] += temp * pa0[k]; \
|
||||
\
|
||||
y += inc_y; \
|
||||
k++; \
|
||||
} \
|
||||
} \
|
||||
|
||||
/* Single-precision GEMV, non-transposed: y += alpha * A * x.
 *
 * m      - number of rows of A (elements of y that are updated)
 * n      - number of columns of A (elements of x that are read)
 * dummy1 - not used by this function
 * alpha  - scalar applied to x before the product
 * A, lda - matrix; column j starts at A + j * lda
 * x, inc_x - input vector and its element stride
 * y, inc_y - vector updated in place and its element stride
 * buffer - not used by this function
 *
 * Always returns 0.  The same SGEMV_N_MSA body is expanded four times, each
 * time with x/y load and store helpers matched to whether inc_x and inc_y
 * are 1. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||
FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
/* start of y; SGEMV_N_MSA rewinds y to this for every column group */
FLOAT *y_org = y;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp, temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
v4f32 v_alpha, x0, x1, y0, y1;
|
||||
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
|
||||
|
||||
/* vector form of alpha, used by the SLOAD_X*_SCALE_VECTOR loaders */
v_alpha = COPY_FLOAT_TO_VECTOR(alpha);
|
||||
|
||||
/* pointers to the first eight columns of A */
pa0 = A;
|
||||
pa1 = A + lda;
|
||||
pa2 = A + 2 * lda;
|
||||
pa3 = A + 3 * lda;
|
||||
pa4 = A + 4 * lda;
|
||||
pa5 = A + 5 * lda;
|
||||
pa6 = A + 6 * lda;
|
||||
pa7 = A + 7 * lda;
|
||||
|
||||
/* case 1: x and y both unit stride - vector loads/stores for both */
if ((1 == inc_x) && (1 == inc_y))
|
||||
{
|
||||
#define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
|
||||
#define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
|
||||
#define SLOAD_Y8 SLOAD_Y8_VECTOR
|
||||
#define SLOAD_Y4 SLOAD_Y4_VECTOR
|
||||
#define SSTORE_Y8 SSTORE_Y8_VECTOR
|
||||
#define SSTORE_Y4 SSTORE_Y4_VECTOR
|
||||
|
||||
SGEMV_N_MSA();
|
||||
|
||||
#undef SLOAD_X8_SCALE
|
||||
#undef SLOAD_X4_SCALE
|
||||
#undef SLOAD_Y8
|
||||
#undef SLOAD_Y4
|
||||
#undef SSTORE_Y8
|
||||
#undef SSTORE_Y4
|
||||
}
|
||||
/* case 2: only y is unit stride - scalar x loads, vector y access */
else if (1 == inc_y)
|
||||
{
|
||||
#define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
|
||||
#define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
|
||||
#define SLOAD_Y8 SLOAD_Y8_VECTOR
|
||||
#define SLOAD_Y4 SLOAD_Y4_VECTOR
|
||||
#define SSTORE_Y8 SSTORE_Y8_VECTOR
|
||||
#define SSTORE_Y4 SSTORE_Y4_VECTOR
|
||||
|
||||
SGEMV_N_MSA();
|
||||
|
||||
#undef SLOAD_X8_SCALE
|
||||
#undef SLOAD_X4_SCALE
|
||||
#undef SLOAD_Y8
|
||||
#undef SLOAD_Y4
|
||||
#undef SSTORE_Y8
|
||||
#undef SSTORE_Y4
|
||||
}
|
||||
/* case 3: only x is unit stride - vector x loads, per-element y access */
else if (1 == inc_x)
|
||||
{
|
||||
#define SLOAD_X8_SCALE SLOAD_X8_SCALE_VECTOR
|
||||
#define SLOAD_X4_SCALE SLOAD_X4_SCALE_VECTOR
|
||||
#define SLOAD_Y8 SLOAD_Y8_GP
|
||||
#define SLOAD_Y4 SLOAD_Y4_GP
|
||||
#define SSTORE_Y8 SSTORE_Y8_GP
|
||||
#define SSTORE_Y4 SSTORE_Y4_GP
|
||||
|
||||
SGEMV_N_MSA();
|
||||
|
||||
#undef SLOAD_X8_SCALE
|
||||
#undef SLOAD_X4_SCALE
|
||||
#undef SLOAD_Y8
|
||||
#undef SLOAD_Y4
|
||||
#undef SSTORE_Y8
|
||||
#undef SSTORE_Y4
|
||||
}
|
||||
/* case 4: neither stride is 1 - per-element access for both x and y */
else
|
||||
{
|
||||
#define SLOAD_X8_SCALE SLOAD_X8_SCALE_GP
|
||||
#define SLOAD_X4_SCALE SLOAD_X4_SCALE_GP
|
||||
#define SLOAD_Y8 SLOAD_Y8_GP
|
||||
#define SLOAD_Y4 SLOAD_Y4_GP
|
||||
#define SSTORE_Y8 SSTORE_Y8_GP
|
||||
#define SSTORE_Y4 SSTORE_Y4_GP
|
||||
|
||||
SGEMV_N_MSA();
|
||||
|
||||
#undef SLOAD_X8_SCALE
|
||||
#undef SLOAD_X4_SCALE
|
||||
#undef SLOAD_Y8
|
||||
#undef SLOAD_Y4
|
||||
#undef SSTORE_Y8
|
||||
#undef SSTORE_Y4
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
|
@ -0,0 +1,463 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Transposed kernel step for 8 columns x 8 rows.
 * Loads two vectors from each column pointer pa0..pa7 at row offset k and
 * adds the element-wise products with x0 / x1 into the per-column
 * accumulators tp0..tp7.  The accumulators still hold four partial sums
 * each; they are reduced later in SGEMV_T_MSA.  Uses pa0..pa7, k, x0, x1,
 * t0..t15 and tp0..tp7 from the enclosing scope. */
#define SGEMV_T_8x8() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
LD_SP2(pa2 + k, 4, t4, t5); \
|
||||
LD_SP2(pa3 + k, 4, t6, t7); \
|
||||
LD_SP2(pa4 + k, 4, t8, t9); \
|
||||
LD_SP2(pa5 + k, 4, t10, t11); \
|
||||
LD_SP2(pa6 + k, 4, t12, t13); \
|
||||
LD_SP2(pa7 + k, 4, t14, t15); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp0 += x1 * t1; \
|
||||
\
|
||||
tp1 += x0 * t2; \
|
||||
tp1 += x1 * t3; \
|
||||
\
|
||||
tp2 += x0 * t4; \
|
||||
tp2 += x1 * t5; \
|
||||
\
|
||||
tp3 += x0 * t6; \
|
||||
tp3 += x1 * t7; \
|
||||
\
|
||||
tp4 += x0 * t8; \
|
||||
tp4 += x1 * t9; \
|
||||
\
|
||||
tp5 += x0 * t10; \
|
||||
tp5 += x1 * t11; \
|
||||
\
|
||||
tp6 += x0 * t12; \
|
||||
tp6 += x1 * t13; \
|
||||
\
|
||||
tp7 += x0 * t14; \
|
||||
tp7 += x1 * t15; \
|
||||
}
|
||||
|
||||
/* Transposed kernel step for 8 columns x 4 rows: loads one vector from
 * each of pa0..pa7 at row offset k and adds its element-wise product with
 * x0 into tp0..tp7. */
#define SGEMV_T_8x4() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
t4 = LD_SP(pa2 + k); \
|
||||
t6 = LD_SP(pa3 + k); \
|
||||
t8 = LD_SP(pa4 + k); \
|
||||
t10 = LD_SP(pa5 + k); \
|
||||
t12 = LD_SP(pa6 + k); \
|
||||
t14 = LD_SP(pa7 + k); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp1 += x0 * t2; \
|
||||
tp2 += x0 * t4; \
|
||||
tp3 += x0 * t6; \
|
||||
tp4 += x0 * t8; \
|
||||
tp5 += x0 * t10; \
|
||||
tp6 += x0 * t12; \
|
||||
tp7 += x0 * t14; \
|
||||
}
|
||||
|
||||
/* Transposed kernel step for 4 columns x 8 rows: loads two vectors from
 * each of pa0..pa3 at row offset k and adds the element-wise products with
 * x0 / x1 into tp0..tp3. */
#define SGEMV_T_4x8() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
LD_SP2(pa2 + k, 4, t4, t5); \
|
||||
LD_SP2(pa3 + k, 4, t6, t7); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp0 += x1 * t1; \
|
||||
\
|
||||
tp1 += x0 * t2; \
|
||||
tp1 += x1 * t3; \
|
||||
\
|
||||
tp2 += x0 * t4; \
|
||||
tp2 += x1 * t5; \
|
||||
\
|
||||
tp3 += x0 * t6; \
|
||||
tp3 += x1 * t7; \
|
||||
}
|
||||
|
||||
/* Transposed kernel step for 4 columns x 4 rows: loads one vector from
 * each of pa0..pa3 at row offset k and adds its element-wise product with
 * x0 into tp0..tp3. */
#define SGEMV_T_4x4() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
t4 = LD_SP(pa2 + k); \
|
||||
t6 = LD_SP(pa3 + k); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp1 += x0 * t2; \
|
||||
tp2 += x0 * t4; \
|
||||
tp3 += x0 * t6; \
|
||||
}
|
||||
|
||||
/* Transposed kernel step for 2 columns x 8 rows: loads two vectors from
 * each of pa0 and pa1 at row offset k and adds the element-wise products
 * with x0 / x1 into tp0 and tp1. */
#define SGEMV_T_2x8() \
|
||||
{ \
|
||||
LD_SP2(pa0 + k, 4, t0, t1); \
|
||||
LD_SP2(pa1 + k, 4, t2, t3); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp0 += x1 * t1; \
|
||||
\
|
||||
tp1 += x0 * t2; \
|
||||
tp1 += x1 * t3; \
|
||||
}
|
||||
|
||||
/* Transposed kernel step for 2 columns x 4 rows: loads one vector from
 * each of pa0 and pa1 at row offset k and adds its element-wise product
 * with x0 into tp0 and tp1. */
#define SGEMV_T_2x4() \
|
||||
{ \
|
||||
t0 = LD_SP(pa0 + k); \
|
||||
t2 = LD_SP(pa1 + k); \
|
||||
\
|
||||
tp0 += x0 * t0; \
|
||||
tp1 += x0 * t2; \
|
||||
}
|
||||
|
||||
/* Strided gather of 8 x elements into x0 (elements 0..3) and x1 (4..7).
 * Each element at x + j * inc_x is read through an int pointer and placed
 * into one lane with __msa_insert_w.  tp0 is only the starting vector
 * value; all four lanes of each result are overwritten.
 * NOTE(review): the float storage is read via (int *), i.e. type punning
 * through a pointer cast - confirm the build tolerates this. */
#define SLOAD_X8_GP() \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 4 * inc_x))); \
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 1, *((int *)(x + 5 * inc_x))); \
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 2, *((int *)(x + 6 * inc_x))); \
|
||||
x1 = (v4f32) __msa_insert_w((v4i32) x1, 3, *((int *)(x + 7 * inc_x))); \
|
||||
|
||||
/* Strided gather of 4 x elements (x + j * inc_x, j = 0..3) into the lanes
 * of x0 via __msa_insert_w; tp0 is only the starting vector value and
 * every lane is overwritten.  Elements are read through an int pointer. */
#define SLOAD_X4_GP() \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) tp0, 0, *((int *)(x + 0 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 1, *((int *)(x + 1 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 2, *((int *)(x + 2 * inc_x))); \
|
||||
x0 = (v4f32) __msa_insert_w((v4i32) x0, 3, *((int *)(x + 3 * inc_x))); \
|
||||
|
||||
/* Vector-load variants of the x loaders (selected by the caller when
 * inc_x == 1): load x0/x1, or just x0, directly from x. */
#define SLOAD_X8_VECTOR() LD_SP2(x, 4, x0, x1);
|
||||
#define SLOAD_X4_VECTOR() x0 = LD_SP(x);
|
||||
|
||||
/* Body of the transposed kernel: for each column j of A,
 * y[j * inc_y] += alpha * sum over rows k of (column_j[k] * x[k * inc_x]).
 *
 * Must be expanded where SLOAD_X8 and SLOAD_X4 are #defined to the _VECTOR
 * or _GP variants, and where m, n, alpha, lda, x, srcx_org, inc_x, y, inc_y,
 * i, j, k, pa0..pa7, temp0..temp7, res0..res7, x0, x1, tp0..tp7 and zero
 * are in scope.
 *
 * Columns are consumed in groups of 8 (n >> 3 times), then 4, 2 and 1.  For
 * every group the accumulators are cleared, x is rewound to srcx_org and
 * the m rows are walked in chunks of 8, then 4.  The vector accumulators
 * are then reduced to one scalar per column (temp<j>), the last (m & 3)
 * rows are added with scalar code, and the result is scaled by alpha and
 * added to y.  y and the column pointers are advanced after each group.
 *
 * The 8- and 4-column reductions sum tp0..tp3 (and tp4..tp7) after
 * TRANSPOSE4x4_SP_SP, a macros_msa.h helper; presumably a 4x4 transpose, so
 * lane i of the sum is the horizontal sum of the original tp<i> -
 * confirm there. */
#define SGEMV_T_MSA() \
|
||||
for (j = (n >> 3); j--;) \
|
||||
{ \
|
||||
tp0 = zero; \
|
||||
tp1 = zero; \
|
||||
tp2 = zero; \
|
||||
tp3 = zero; \
|
||||
tp4 = zero; \
|
||||
tp5 = zero; \
|
||||
tp6 = zero; \
|
||||
tp7 = zero; \
|
||||
\
|
||||
k = 0; \
|
||||
x = srcx_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_X8(); \
|
||||
SGEMV_T_8x8(); \
|
||||
\
|
||||
x += 8 * inc_x; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_X4(); \
|
||||
SGEMV_T_8x4(); \
|
||||
\
|
||||
x += 4 * inc_x; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
|
||||
tp0, tp1, tp2, tp3); \
|
||||
TRANSPOSE4x4_SP_SP(tp4, tp5, tp6, tp7, \
|
||||
tp4, tp5, tp6, tp7); \
|
||||
tp0 += tp1; \
|
||||
tp0 += tp2; \
|
||||
tp0 += tp3; \
|
||||
tp4 += tp5; \
|
||||
tp4 += tp6; \
|
||||
tp4 += tp7; \
|
||||
\
|
||||
temp0 = tp0[0]; \
|
||||
temp1 = tp0[1]; \
|
||||
temp2 = tp0[2]; \
|
||||
temp3 = tp0[3]; \
|
||||
temp4 = tp4[0]; \
|
||||
temp5 = tp4[1]; \
|
||||
temp6 = tp4[2]; \
|
||||
temp7 = tp4[3]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp0 += pa0[k] * x[0]; \
|
||||
temp1 += pa1[k] * x[0]; \
|
||||
temp2 += pa2[k] * x[0]; \
|
||||
temp3 += pa3[k] * x[0]; \
|
||||
temp4 += pa4[k] * x[0]; \
|
||||
temp5 += pa5[k] * x[0]; \
|
||||
temp6 += pa6[k] * x[0]; \
|
||||
temp7 += pa7[k] * x[0]; \
|
||||
\
|
||||
x += inc_x; \
|
||||
k++; \
|
||||
} \
|
||||
\
|
||||
res0 = y[0 * inc_y]; \
|
||||
res1 = y[1 * inc_y]; \
|
||||
res2 = y[2 * inc_y]; \
|
||||
res3 = y[3 * inc_y]; \
|
||||
res4 = y[4 * inc_y]; \
|
||||
res5 = y[5 * inc_y]; \
|
||||
res6 = y[6 * inc_y]; \
|
||||
res7 = y[7 * inc_y]; \
|
||||
\
|
||||
res0 += alpha * temp0; \
|
||||
res1 += alpha * temp1; \
|
||||
res2 += alpha * temp2; \
|
||||
res3 += alpha * temp3; \
|
||||
res4 += alpha * temp4; \
|
||||
res5 += alpha * temp5; \
|
||||
res6 += alpha * temp6; \
|
||||
res7 += alpha * temp7; \
|
||||
\
|
||||
y[0 * inc_y] = res0; \
|
||||
y[1 * inc_y] = res1; \
|
||||
y[2 * inc_y] = res2; \
|
||||
y[3 * inc_y] = res3; \
|
||||
y[4 * inc_y] = res4; \
|
||||
y[5 * inc_y] = res5; \
|
||||
y[6 * inc_y] = res6; \
|
||||
y[7 * inc_y] = res7; \
|
||||
\
|
||||
y += 8 * inc_y; \
|
||||
\
|
||||
pa0 += 8 * lda; \
|
||||
pa1 += 8 * lda; \
|
||||
pa2 += 8 * lda; \
|
||||
pa3 += 8 * lda; \
|
||||
pa4 += 8 * lda; \
|
||||
pa5 += 8 * lda; \
|
||||
pa6 += 8 * lda; \
|
||||
pa7 += 8 * lda; \
|
||||
} \
|
||||
\
|
||||
if (n & 4) \
|
||||
{ \
|
||||
tp0 = zero; \
|
||||
tp1 = zero; \
|
||||
tp2 = zero; \
|
||||
tp3 = zero; \
|
||||
\
|
||||
k = 0; \
|
||||
x = srcx_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_X8(); \
|
||||
SGEMV_T_4x8(); \
|
||||
\
|
||||
x += 8 * inc_x; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_X4(); \
|
||||
SGEMV_T_4x4(); \
|
||||
\
|
||||
x += 4 * inc_x; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
TRANSPOSE4x4_SP_SP(tp0, tp1, tp2, tp3, \
|
||||
tp0, tp1, tp2, tp3); \
|
||||
tp0 += tp1; \
|
||||
tp0 += tp2; \
|
||||
tp0 += tp3; \
|
||||
\
|
||||
temp0 = tp0[0]; \
|
||||
temp1 = tp0[1]; \
|
||||
temp2 = tp0[2]; \
|
||||
temp3 = tp0[3]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp0 += pa0[k] * x[0]; \
|
||||
temp1 += pa1[k] * x[0]; \
|
||||
temp2 += pa2[k] * x[0]; \
|
||||
temp3 += pa3[k] * x[0]; \
|
||||
\
|
||||
x += inc_x; \
|
||||
k++; \
|
||||
} \
|
||||
\
|
||||
res0 = y[0 * inc_y]; \
|
||||
res1 = y[1 * inc_y]; \
|
||||
res2 = y[2 * inc_y]; \
|
||||
res3 = y[3 * inc_y]; \
|
||||
\
|
||||
res0 += alpha * temp0; \
|
||||
res1 += alpha * temp1; \
|
||||
res2 += alpha * temp2; \
|
||||
res3 += alpha * temp3; \
|
||||
\
|
||||
y[0 * inc_y] = res0; \
|
||||
y[1 * inc_y] = res1; \
|
||||
y[2 * inc_y] = res2; \
|
||||
y[3 * inc_y] = res3; \
|
||||
\
|
||||
y += 4 * inc_y; \
|
||||
\
|
||||
pa0 += 4 * lda; \
|
||||
pa1 += 4 * lda; \
|
||||
pa2 += 4 * lda; \
|
||||
pa3 += 4 * lda; \
|
||||
} \
|
||||
\
|
||||
if (n & 2) \
|
||||
{ \
|
||||
tp0 = zero; \
|
||||
tp1 = zero; \
|
||||
\
|
||||
k = 0; \
|
||||
x = srcx_org; \
|
||||
\
|
||||
for (i = (m >> 3); i--;) \
|
||||
{ \
|
||||
SLOAD_X8(); \
|
||||
SGEMV_T_2x8(); \
|
||||
\
|
||||
x += 8 * inc_x; \
|
||||
k += 8; \
|
||||
} \
|
||||
\
|
||||
if (m & 4) \
|
||||
{ \
|
||||
SLOAD_X4(); \
|
||||
SGEMV_T_2x4(); \
|
||||
\
|
||||
x += 4 * inc_x; \
|
||||
k += 4; \
|
||||
} \
|
||||
\
|
||||
ILVRL_W2_SP(tp1, tp0, tp2, tp3); \
|
||||
\
|
||||
tp2 += tp3; \
|
||||
\
|
||||
temp0 = tp2[0] + tp2[2]; \
|
||||
temp1 = tp2[1] + tp2[3]; \
|
||||
\
|
||||
for (i = (m & 3); i--;) \
|
||||
{ \
|
||||
temp0 += pa0[k] * x[0]; \
|
||||
temp1 += pa1[k] * x[0]; \
|
||||
\
|
||||
x += inc_x; \
|
||||
k++; \
|
||||
} \
|
||||
\
|
||||
res0 = y[0 * inc_y]; \
|
||||
res1 = y[1 * inc_y]; \
|
||||
\
|
||||
res0 += alpha * temp0; \
|
||||
res1 += alpha * temp1; \
|
||||
\
|
||||
y[0 * inc_y] = res0; \
|
||||
y[1 * inc_y] = res1; \
|
||||
\
|
||||
y += 2 * inc_y; \
|
||||
\
|
||||
pa0 += 2 * lda; \
|
||||
pa1 += 2 * lda; \
|
||||
} \
|
||||
\
|
||||
if (n & 1) \
|
||||
{ \
|
||||
temp0 = 0.0; \
|
||||
\
|
||||
k = 0; \
|
||||
x = srcx_org; \
|
||||
\
|
||||
for (i = m; i--;) \
|
||||
{ \
|
||||
temp0 += pa0[k] * x[0]; \
|
||||
\
|
||||
x += inc_x; \
|
||||
k++; \
|
||||
} \
|
||||
\
|
||||
y[0] += alpha * temp0; \
|
||||
y += inc_y; \
|
||||
pa0 += lda; \
|
||||
}
|
||||
|
||||
/* Single-precision GEMV, transposed: y += alpha * A^T * x.
 *
 * m      - number of rows of A (elements of x that are read)
 * n      - number of columns of A (elements of y that are updated)
 * dummy1 - not used by this function
 * alpha  - scalar applied to each column/x dot product
 * A, lda - matrix; column j starts at A + j * lda
 * x, inc_x - input vector and its element stride
 * y, inc_y - vector updated in place and its element stride
 * buffer - not used by this function
 *
 * Always returns 0.  The SGEMV_T_MSA body is expanded twice: with vector x
 * loads when inc_x is 1, otherwise with per-element x loads.  y is always
 * accessed element by element with stride inc_y. */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *A,
|
||||
BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
|
||||
FLOAT *buffer)
|
||||
{
|
||||
BLASLONG i, j, k;
|
||||
/* start of x; SGEMV_T_MSA rewinds x to this for every column group */
FLOAT *srcx_org = x;
|
||||
FLOAT *pa0, *pa1, *pa2, *pa3, *pa4, *pa5, *pa6, *pa7;
|
||||
FLOAT temp0, temp1, temp2, temp3, temp4, temp5, temp6, temp7;
|
||||
FLOAT res0, res1, res2, res3, res4, res5, res6, res7;
|
||||
v4f32 x0, x1;
|
||||
v4f32 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
|
||||
v4f32 tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
|
||||
/* all-zero vector used to clear the accumulators tp0..tp7 */
v4f32 zero = {0};
|
||||
|
||||
/* pointers to the first eight columns of A */
pa0 = A + 0 * lda;
|
||||
pa1 = A + 1 * lda;
|
||||
pa2 = A + 2 * lda;
|
||||
pa3 = A + 3 * lda;
|
||||
pa4 = A + 4 * lda;
|
||||
pa5 = A + 5 * lda;
|
||||
pa6 = A + 6 * lda;
|
||||
pa7 = A + 7 * lda;
|
||||
|
||||
/* unit-stride x: load x with vector loads */
if (1 == inc_x)
|
||||
{
|
||||
#define SLOAD_X8 SLOAD_X8_VECTOR
|
||||
#define SLOAD_X4 SLOAD_X4_VECTOR
|
||||
|
||||
SGEMV_T_MSA();
|
||||
|
||||
#undef SLOAD_X8
|
||||
#undef SLOAD_X4
|
||||
}
|
||||
/* any other stride: gather x element by element */
else
|
||||
{
|
||||
#define SLOAD_X8 SLOAD_X8_GP
|
||||
#define SLOAD_X4 SLOAD_X4_GP
|
||||
|
||||
SGEMV_T_MSA();
|
||||
|
||||
#undef SLOAD_X8
|
||||
#undef SLOAD_X4
|
||||
}
|
||||
|
||||
return(0);
|
||||
}
|
|
@ -0,0 +1,170 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <math.h>
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Bitwise-ANDs the vector `in` (reinterpreted as v2i64) with the enclosing
 * scope's and_vec and returns the result as v2f64.  With and_vec set to
 * 0x7FFFFFFFFFFFFFFF per lane, as in CNAME below, this clears the top bit
 * of each 64-bit lane. */
#define AND_VEC_D(in) ((v2f64) ((v2i64) in & and_vec))
|
||||
|
||||
/* Sums AND_VEC_D() of n vector loads from x and leaves the scalar total in
 * sumf.  inc_val is passed through to the LD_DP*_INC helpers from
 * macros_msa.h (presumably the pointer step, in doubles, applied after each
 * v2f64 load - confirm there).
 *
 * Structure:
 *   - if n > 8, the first 8 loads initialise the four accumulators
 *     sum_abs0..sum_abs3 and n is reduced by 8; otherwise the accumulators
 *     start from zero_v;
 *   - a loop handles the remaining loads 8 at a time (n >> 3 iterations);
 *   - the last n & 7 loads are handled by one branch per remainder, tested
 *     in the order 7, 6, 5, 3, 4, 2, 1;
 *   - the accumulators are added together and the two lanes of the result
 *     are summed into sumf.
 *
 * Modifies n and x.  Uses i, src0..src7, sum_abs0..sum_abs3, zero_v,
 * and_vec and sumf from the enclosing scope.  Expands to several
 * statements, not a single block. */
#define PROCESS_ZD(inc_val) \
|
||||
if (n > 8) \
|
||||
{ \
|
||||
n -= 8; \
|
||||
\
|
||||
LD_DP8_INC(x, inc_val, src0, src1, src2, \
|
||||
src3, src4, src5, src6, src7); \
|
||||
\
|
||||
sum_abs0 = AND_VEC_D(src0); \
|
||||
sum_abs1 = AND_VEC_D(src1); \
|
||||
sum_abs2 = AND_VEC_D(src2); \
|
||||
sum_abs3 = AND_VEC_D(src3); \
|
||||
sum_abs0 += AND_VEC_D(src4); \
|
||||
sum_abs1 += AND_VEC_D(src5); \
|
||||
sum_abs2 += AND_VEC_D(src6); \
|
||||
sum_abs3 += AND_VEC_D(src7); \
|
||||
} \
|
||||
else \
|
||||
{ \
|
||||
sum_abs0 = zero_v; \
|
||||
sum_abs1 = zero_v; \
|
||||
sum_abs2 = zero_v; \
|
||||
sum_abs3 = zero_v; \
|
||||
} \
|
||||
\
|
||||
for (i = (n >> 3); i--;) \
|
||||
{ \
|
||||
LD_DP8_INC(x, inc_val, src0, src1, src2, \
|
||||
src3, src4, src5, src6, src7); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
sum_abs3 += AND_VEC_D(src3); \
|
||||
sum_abs0 += AND_VEC_D(src4); \
|
||||
sum_abs1 += AND_VEC_D(src5); \
|
||||
sum_abs2 += AND_VEC_D(src6); \
|
||||
sum_abs3 += AND_VEC_D(src7); \
|
||||
} \
|
||||
\
|
||||
if (n & 7) \
|
||||
{ \
|
||||
if ((n & 4) && (n & 2) && (n & 1)) \
|
||||
{ \
|
||||
LD_DP7_INC(x, inc_val, src0, src1, src2, \
|
||||
src3, src4, src5, src6); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
sum_abs3 += AND_VEC_D(src3); \
|
||||
sum_abs0 += AND_VEC_D(src4); \
|
||||
sum_abs1 += AND_VEC_D(src5); \
|
||||
sum_abs2 += AND_VEC_D(src6); \
|
||||
} \
|
||||
else if ((n & 4) && (n & 2)) \
|
||||
{ \
|
||||
LD_DP6_INC(x, inc_val, src0, src1, src2, \
|
||||
src3, src4, src5); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
sum_abs3 += AND_VEC_D(src3); \
|
||||
sum_abs0 += AND_VEC_D(src4); \
|
||||
sum_abs1 += AND_VEC_D(src5); \
|
||||
} \
|
||||
else if ((n & 4) && (n & 1)) \
|
||||
{ \
|
||||
LD_DP5_INC(x, inc_val, src0, src1, src2, \
|
||||
src3, src4); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
sum_abs3 += AND_VEC_D(src3); \
|
||||
sum_abs0 += AND_VEC_D(src4); \
|
||||
} \
|
||||
else if ((n & 2) && (n & 1)) \
|
||||
{ \
|
||||
LD_DP3_INC(x, inc_val, src0, src1, src2); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
} \
|
||||
else if (n & 4) \
|
||||
{ \
|
||||
LD_DP4_INC(x, inc_val, src0, src1, src2, \
|
||||
src3); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
sum_abs2 += AND_VEC_D(src2); \
|
||||
sum_abs3 += AND_VEC_D(src3); \
|
||||
} \
|
||||
else if (n & 2) \
|
||||
{ \
|
||||
LD_DP2_INC(x, inc_val, src0, src1); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
sum_abs1 += AND_VEC_D(src1); \
|
||||
} \
|
||||
else if (n & 1) \
|
||||
{ \
|
||||
src0 = LD_DP(x); \
|
||||
\
|
||||
sum_abs0 += AND_VEC_D(src0); \
|
||||
} \
|
||||
} \
|
||||
\
|
||||
sum_abs0 += sum_abs1 + sum_abs2 + sum_abs3; \
|
||||
sumf = sum_abs0[0] + sum_abs0[1];
|
||||
|
||||
/* Returns the sum produced by PROCESS_ZD over n vector loads from x: each
 * load is one v2f64 (two doubles) with the top bit of both lanes cleared
 * before accumulation.
 *
 * n     - number of two-double elements to process
 * x     - input data
 * inc_x - element stride; doubled before use because each element spans
 *         two doubles (presumably the real and imaginary part of a complex
 *         number, going by the ZD naming - confirm against the caller)
 *
 * Returns 0.0 when n <= 0 or inc_x <= 0. */
FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||
{
|
||||
BLASLONG i;
|
||||
FLOAT sumf = 0.0;
|
||||
v2f64 src0, src1, src2, src3, src4, src5, src6, src7;
|
||||
v2f64 sum_abs0, sum_abs1, sum_abs2, sum_abs3;
|
||||
v2f64 zero_v = {0};
|
||||
/* mask with every bit set except the top (sign) bit of each 64-bit lane */
v2i64 and_vec = {0x7FFFFFFFFFFFFFFF, 0x7FFFFFFFFFFFFFFF};
|
||||
|
||||
if (n <= 0 || inc_x <= 0) return (sumf);
|
||||
|
||||
if (1 == inc_x)
|
||||
{
|
||||
/* contiguous elements: step of 2 doubles per element */
PROCESS_ZD(2);
|
||||
}
|
||||
else
|
||||
{
|
||||
/* convert the element stride into a stride counted in doubles */
inc_x *= 2;
|
||||
PROCESS_ZD(inc_x);
|
||||
}
|
||||
|
||||
return (sumf);
|
||||
}
|
|
@ -0,0 +1,227 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
/* Operator tokens selected by the CONJ build flag. */
#if defined(CONJ)
    #define OP2 -=
    #define OP3 +
    #define OP4 -
#else
    #define OP2 +=
    #define OP3 -
    #define OP4 +
#endif
|
||||
|
||||
/*
 * Accumulate four complex products into dot0 (real part) and dot1 (imaginary
 * part).  Expects vxNr/vxNi and vyNr/vyNi (N = 0..3), dot0 and dot1 in the
 * caller's scope.  OPR0 and OPR1 are bare '+' or '-' tokens that get pasted
 * onto '=' to choose the sign of the vxNi * vyNi and vxNi * vyNr terms.
 *
 * The body is wrapped in do { } while (0) so the expansion is one statement
 * and stays correct under an unbraced if/else; terminate the call with ';'.
 */
#define DOT16_KERNEL(OPR0, OPR1)       \
    do                                 \
    {                                  \
        dot0 += (vx0r * vy0r);         \
        dot0 OPR0## = (vx0i * vy0i);   \
        dot1 OPR1## = (vx0i * vy0r);   \
        dot1 += (vx0r * vy0i);         \
                                       \
        dot0 += (vx1r * vy1r);         \
        dot0 OPR0## = (vx1i * vy1i);   \
        dot1 OPR1## = (vx1i * vy1r);   \
        dot1 += (vx1r * vy1i);         \
                                       \
        dot0 += (vx2r * vy2r);         \
        dot0 OPR0## = (vx2i * vy2i);   \
        dot1 OPR1## = (vx2i * vy2r);   \
        dot1 += (vx2r * vy2i);         \
                                       \
        dot0 += (vx3r * vy3r);         \
        dot0 OPR0## = (vx3i * vy3i);   \
        dot1 OPR1## = (vx3i * vy3r);   \
        dot1 += (vx3r * vy3i);         \
    } while (0)
|
||||
|
||||
/*
 * Accumulate three complex products into dot0 (real part) and dot1 (imaginary
 * part).  Expects vxNr/vxNi and vyNr/vyNi (N = 0..2), dot0 and dot1 in the
 * caller's scope.  OPR0 and OPR1 are bare '+' or '-' tokens pasted onto '='.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * terminate the call with ';'.
 */
#define DOT12_KERNEL(OPR0, OPR1)       \
    do                                 \
    {                                  \
        dot0 += (vx0r * vy0r);         \
        dot0 OPR0## = (vx0i * vy0i);   \
        dot1 OPR1## = (vx0i * vy0r);   \
        dot1 += (vx0r * vy0i);         \
                                       \
        dot0 += (vx1r * vy1r);         \
        dot0 OPR0## = (vx1i * vy1i);   \
        dot1 OPR1## = (vx1i * vy1r);   \
        dot1 += (vx1r * vy1i);         \
                                       \
        dot0 += (vx2r * vy2r);         \
        dot0 OPR0## = (vx2i * vy2i);   \
        dot1 OPR1## = (vx2i * vy2r);   \
        dot1 += (vx2r * vy2i);         \
    } while (0)
|
||||
|
||||
/*
 * Accumulate two complex products into dot0 (real part) and dot1 (imaginary
 * part).  Expects vx0r/vx0i, vx1r/vx1i, vy0r/vy0i, vy1r/vy1i, dot0 and dot1
 * in the caller's scope.  OPR0 and OPR1 are bare '+' or '-' tokens pasted
 * onto '='.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * terminate the call with ';'.
 */
#define DOT8_KERNEL(OPR0, OPR1)        \
    do                                 \
    {                                  \
        dot0 += (vx0r * vy0r);         \
        dot0 OPR0## = (vx0i * vy0i);   \
        dot1 OPR1## = (vx0i * vy0r);   \
        dot1 += (vx0r * vy0i);         \
                                       \
        dot0 += (vx1r * vy1r);         \
        dot0 OPR0## = (vx1i * vy1i);   \
        dot1 OPR1## = (vx1i * vy1r);   \
        dot1 += (vx1r * vy1i);         \
    } while (0)
|
||||
|
||||
/*
 * Accumulate one complex product into dot0 (real part) and dot1 (imaginary
 * part).  Expects vx0r/vx0i, vy0r/vy0i, dot0 and dot1 in the caller's scope.
 * OPR0 and OPR1 are bare '+' or '-' tokens pasted onto '='.
 *
 * Wrapped in do { } while (0) so the expansion is a single statement;
 * terminate the call with ';'.
 */
#define DOT4_KERNEL(OPR0, OPR1)        \
    do                                 \
    {                                  \
        dot0 += (vx0r * vy0r);         \
        dot0 OPR0## = (vx0i * vy0i);   \
        dot1 OPR1## = (vx0i * vy0r);   \
        dot1 += (vx0r * vy0i);         \
    } while (0)
|
||||
|
||||
/* return double, x,y double */
|
||||
/* zdotc - CONJ */
|
||||
/* zdotu - !CONJ */
|
||||
#ifndef _MSC_VER
|
||||
#include <complex.h>
|
||||
FLOAT _Complex CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#else
|
||||
OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
|
||||
#endif
|
||||
{
|
||||
BLASLONG i = 0;
|
||||
FLOAT dot[2];
|
||||
BLASLONG inc_x2;
|
||||
BLASLONG inc_y2;
|
||||
v2f64 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7;
|
||||
v2f64 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7;
|
||||
v2f64 vx0r, vx0i, vx1r, vx1i, vx2r, vx2i, vx3r, vx3i;
|
||||
v2f64 vy0r, vy0i, vy1r, vy1i, vy2r, vy2i, vy3r, vy3i;
|
||||
v2f64 dot0 = {0, 0};
|
||||
v2f64 dot1 = {0, 0};
|
||||
v2f64 zero = {0, 0};
|
||||
openblas_complex_double result;
|
||||
|
||||
dot[0] = 0.0;
|
||||
dot[1] = 0.0;
|
||||
|
||||
__real__(result) = 0.0;
|
||||
__imag__(result) = 0.0;
|
||||
|
||||
if ( n < 1 ) return(result);
|
||||
|
||||
inc_x2 = 2 * inc_x;
|
||||
inc_y2 = 2 * inc_y;
|
||||
|
||||
for (i = (n >> 3); i--;)
|
||||
{
|
||||
LD_DP8_INC(x, inc_x2, vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7);
|
||||
LD_DP8_INC(y, inc_y2, vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7);
|
||||
|
||||
PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
|
||||
PCKEVOD_D2_DP(vx7, vx6, vx3r, vx3i);
|
||||
|
||||
PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
|
||||
PCKEVOD_D2_DP(vy7, vy6, vy3r, vy3i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT16_KERNEL(-, +);
|
||||
#else
|
||||
DOT16_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 7)
|
||||
{
|
||||
if ((n & 4) && (n & 2))
|
||||
{
|
||||
LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
|
||||
LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
|
||||
LD_DP2_INC(x, inc_x2, vx4, vx5);
|
||||
LD_DP2_INC(y, inc_y2, vy4, vy5);
|
||||
|
||||
PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
|
||||
PCKEVOD_D2_DP(vx5, vx4, vx2r, vx2i);
|
||||
|
||||
PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
|
||||
PCKEVOD_D2_DP(vy5, vy4, vy2r, vy2i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT12_KERNEL(-, +);
|
||||
#else
|
||||
DOT12_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 4)
|
||||
{
|
||||
LD_DP4_INC(x, inc_x2, vx0, vx1, vx2, vx3);
|
||||
LD_DP4_INC(y, inc_y2, vy0, vy1, vy2, vy3);
|
||||
|
||||
PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_D2_DP(vx3, vx2, vx1r, vx1i);
|
||||
|
||||
PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
|
||||
PCKEVOD_D2_DP(vy3, vy2, vy1r, vy1i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT8_KERNEL(-, +);
|
||||
#else
|
||||
DOT8_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
else if (n & 2)
|
||||
{
|
||||
LD_DP2_INC(x, inc_x2, vx0, vx1);
|
||||
LD_DP2_INC(y, inc_y2, vy0, vy1);
|
||||
PCKEVOD_D2_DP(vx1, vx0, vx0r, vx0i);
|
||||
PCKEVOD_D2_DP(vy1, vy0, vy0r, vy0i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT4_KERNEL(-, +);
|
||||
#else
|
||||
DOT4_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
|
||||
if (n & 1)
|
||||
{
|
||||
vx0 = LD_DP(x);
|
||||
vy0 = LD_DP(y);
|
||||
PCKEVOD_D2_DP(zero, vx0, vx0r, vx0i);
|
||||
PCKEVOD_D2_DP(zero, vy0, vy0r, vy0i);
|
||||
|
||||
#if !defined(CONJ)
|
||||
DOT4_KERNEL(-, +);
|
||||
#else
|
||||
DOT4_KERNEL(+, -);
|
||||
#endif
|
||||
}
|
||||
}
|
||||
|
||||
dot[0] += (dot0[0] + dot0[1]);
|
||||
dot[1] += (dot1[0] + dot1[1]);
|
||||
|
||||
__real__(result) = dot[0];
|
||||
__imag__(result) = dot[1];
|
||||
|
||||
return(result);
|
||||
}
|
|
@ -0,0 +1,667 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

/*
 * Sign operators used by the ZGEMV_N macros in this file.
 *
 * OP3/OP4 build temp from alpha and an element of x:
 *     temp_r  = alpha_r * x_r;   temp_r OP3 alpha_i * x_i;
 *     temp_i  = alpha_r * x_i;   temp_i OP4 alpha_i * x_r;
 * so temp = alpha * x without XCONJ and conj(alpha) * x with XCONJ.
 *
 * OP0/OP1/OP2 accumulate temp times a matrix element a into y:
 *     y_r  += temp_r * a_r;      y_r OP0 temp_i * a_i;
 *     y_i OP1 temp_r * a_i;      y_i OP2 temp_i * a_r;
 * For this to be a complex product the sign of OP0 must be the negation of
 * (sign of OP1) * (sign of OP2).  The CONJ rows previously used the opposite
 * OP2 sign, which broke that rule; the product each row yields is noted
 * beside it.
 */
#if !defined(XCONJ)
    #define OP3  -=
    #define OP4  +=
#else
    #define OP3  +=
    #define OP4  -=
#endif

#if !defined(CONJ)
    #if !defined(XCONJ)
        /* temp * a */
        #define OP0  -=
        #define OP1  +=
        #define OP2  +=
    #else
        /* conj(temp) * a */
        #define OP0  +=
        #define OP1  +=
        #define OP2  -=
    #endif
#else
    #if !defined(XCONJ)
        /* temp * conj(a) */
        #define OP0  +=
        #define OP1  -=
        #define OP2  +=
    #else
        /* conj(temp) * conj(a) */
        #define OP0  -=
        #define OP1  -=
        #define OP2  -=
    #endif
#endif
|
||||
|
||||
/*
 * Update four rows of y (y0r/y0i, y1r/y1i) with four matrix columns
 * pa0..pa3 at offset k, weighted by tp0..tp3 (r/i parts).
 * Statements are grouped per accumulator; each accumulator sees its terms in
 * the order: real-weight terms for columns 0..3, then imaginary-weight terms
 * for columns 0..3.
 * NOTE(review): PCKEVOD_D2_DP presumably separates real (even) and imaginary
 * (odd) lanes of the loaded vectors - confirm against macros_msa.h.
 */
#define ZGEMV_N_4x4()                          \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);       \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);       \
                                               \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);       \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);       \
                                               \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);       \
    PCKEVOD_D2_DP(t11, t10, src5r, src5i);     \
                                               \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);     \
    PCKEVOD_D2_DP(t15, t14, src7r, src7i);     \
                                               \
    y0r  += tp0r * src0r;                      \
    y0r  += tp1r * src2r;                      \
    y0r  += tp2r * src4r;                      \
    y0r  += tp3r * src6r;                      \
    y0r OP0 tp0i * src0i;                      \
    y0r OP0 tp1i * src2i;                      \
    y0r OP0 tp2i * src4i;                      \
    y0r OP0 tp3i * src6i;                      \
                                               \
    y1r  += tp0r * src1r;                      \
    y1r  += tp1r * src3r;                      \
    y1r  += tp2r * src5r;                      \
    y1r  += tp3r * src7r;                      \
    y1r OP0 tp0i * src1i;                      \
    y1r OP0 tp1i * src3i;                      \
    y1r OP0 tp2i * src5i;                      \
    y1r OP0 tp3i * src7i;                      \
                                               \
    y0i OP1 tp0r * src0i;                      \
    y0i OP1 tp1r * src2i;                      \
    y0i OP1 tp2r * src4i;                      \
    y0i OP1 tp3r * src6i;                      \
    y0i OP2 tp0i * src0r;                      \
    y0i OP2 tp1i * src2r;                      \
    y0i OP2 tp2i * src4r;                      \
    y0i OP2 tp3i * src6r;                      \
                                               \
    y1i OP1 tp0r * src1i;                      \
    y1i OP1 tp1r * src3i;                      \
    y1i OP1 tp2r * src5i;                      \
    y1i OP1 tp3r * src7i;                      \
    y1i OP2 tp0i * src1r;                      \
    y1i OP2 tp1i * src3r;                      \
    y1i OP2 tp2i * src5r;                      \
    y1i OP2 tp3i * src7r;
|
||||
|
||||
/*
 * Update two rows of y (y0r/y0i) with four matrix columns pa0..pa3 at
 * offset k, weighted by tp0..tp3 (r/i parts).  Each column is loaded and
 * de-interleaved before the accumulation starts.
 */
#define ZGEMV_N_2x4()                        \
    LD_DP2(pa0 + k, 2, t0, t1);              \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);     \
                                             \
    LD_DP2(pa1 + k, 2, t4, t5);              \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);     \
                                             \
    LD_DP2(pa2 + k, 2, t8, t9);              \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);     \
                                             \
    LD_DP2(pa3 + k, 2, t12, t13);            \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);   \
                                             \
    y0r  += tp0r * src0r;                    \
    y0r  += tp1r * src2r;                    \
    y0r  += tp2r * src4r;                    \
    y0r  += tp3r * src6r;                    \
    y0r OP0 tp0i * src0i;                    \
    y0r OP0 tp1i * src2i;                    \
    y0r OP0 tp2i * src4i;                    \
    y0r OP0 tp3i * src6i;                    \
                                             \
    y0i OP1 tp0r * src0i;                    \
    y0i OP1 tp1r * src2i;                    \
    y0i OP1 tp2r * src4i;                    \
    y0i OP1 tp3r * src6i;                    \
    y0i OP2 tp0i * src0r;                    \
    y0i OP2 tp1i * src2r;                    \
    y0i OP2 tp2i * src4r;                    \
    y0i OP2 tp3i * src6r;
|
||||
|
||||
/*
 * Scalar update of one row of y with four matrix columns pa0..pa3 at offset
 * k, weighted by temp0..temp3 (_r/_i parts).  y[0]/y[1] are read into
 * res0/res1, updated one column at a time, and written back.
 */
#define ZGEMV_N_1x4()                 \
    res0 = y[0];                      \
    res1 = y[1];                      \
                                      \
    res0  += temp0_r * pa0[k];        \
    res0 OP0 temp0_i * pa0[k + 1];    \
    res1 OP1 temp0_r * pa0[k + 1];    \
    res1 OP2 temp0_i * pa0[k];        \
                                      \
    res0  += temp1_r * pa1[k];        \
    res0 OP0 temp1_i * pa1[k + 1];    \
    res1 OP1 temp1_r * pa1[k + 1];    \
    res1 OP2 temp1_i * pa1[k];        \
                                      \
    res0  += temp2_r * pa2[k];        \
    res0 OP0 temp2_i * pa2[k + 1];    \
    res1 OP1 temp2_r * pa2[k + 1];    \
    res1 OP2 temp2_i * pa2[k];        \
                                      \
    res0  += temp3_r * pa3[k];        \
    res0 OP0 temp3_i * pa3[k + 1];    \
    res1 OP1 temp3_r * pa3[k + 1];    \
    res1 OP2 temp3_i * pa3[k];        \
                                      \
    y[0] = res0;                      \
    y[1] = res1;
|
||||
|
||||
/*
 * Update four rows of y (y0r/y0i, y1r/y1i) with two matrix columns pa0 and
 * pa1 at offset k, weighted by tp0 and tp1 (r/i parts).  Statements are
 * grouped per accumulator.
 */
#define ZGEMV_N_4x2()                      \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);    \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);   \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);   \
                                           \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);    \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);   \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);   \
                                           \
    y0r  += tp0r * src0r;                  \
    y0r  += tp1r * src2r;                  \
    y0r OP0 tp0i * src0i;                  \
    y0r OP0 tp1i * src2i;                  \
                                           \
    y1r  += tp0r * src1r;                  \
    y1r  += tp1r * src3r;                  \
    y1r OP0 tp0i * src1i;                  \
    y1r OP0 tp1i * src3i;                  \
                                           \
    y0i OP1 tp0r * src0i;                  \
    y0i OP1 tp1r * src2i;                  \
    y0i OP2 tp0i * src0r;                  \
    y0i OP2 tp1i * src2r;                  \
                                           \
    y1i OP1 tp0r * src1i;                  \
    y1i OP1 tp1r * src3i;                  \
    y1i OP2 tp0i * src1r;                  \
    y1i OP2 tp1i * src3r;
|
||||
|
||||
/*
 * Update two rows of y (y0r/y0i) with two matrix columns pa0 and pa1 at
 * offset k, weighted by tp0 and tp1 (r/i parts).
 */
#define ZGEMV_N_2x2()                      \
    LD_DP2(pa0 + k, 2, t0, t1);            \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);   \
                                           \
    LD_DP2(pa1 + k, 2, t4, t5);            \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);   \
                                           \
    y0r  += tp0r * src0r;                  \
    y0r  += tp1r * src2r;                  \
    y0r OP0 tp0i * src0i;                  \
    y0r OP0 tp1i * src2i;                  \
                                           \
    y0i OP1 tp0r * src0i;                  \
    y0i OP1 tp1r * src2i;                  \
    y0i OP2 tp0i * src0r;                  \
    y0i OP2 tp1i * src2r;
|
||||
|
||||
/*
 * Scalar update of one row of y with two matrix columns pa0 and pa1 at
 * offset k, weighted by temp0 and temp1 (_r/_i parts).  y[0]/y[1] are read
 * into res0/res1, updated one column at a time, and written back.
 */
#define ZGEMV_N_1x2()                 \
    res0 = y[0];                      \
    res1 = y[1];                      \
                                      \
    res0  += temp0_r * pa0[k];        \
    res0 OP0 temp0_i * pa0[k + 1];    \
    res1 OP1 temp0_r * pa0[k + 1];    \
    res1 OP2 temp0_i * pa0[k];        \
                                      \
    res0  += temp1_r * pa1[k];        \
    res0 OP0 temp1_i * pa1[k + 1];    \
    res1 OP1 temp1_r * pa1[k + 1];    \
    res1 OP2 temp1_i * pa1[k];        \
                                      \
    y[0] = res0;                      \
    y[1] = res1;
|
||||
|
||||
/*
 * Update four rows of y (y0r/y0i, y1r/y1i) with one matrix column pa0 at
 * offset k, weighted by tp0r/tp0i.  Statements are grouped per accumulator.
 */
#define ZGEMV_N_4x1()                      \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);    \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);   \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);   \
                                           \
    y0r  += tp0r * src0r;                  \
    y0r OP0 tp0i * src0i;                  \
                                           \
    y1r  += tp0r * src1r;                  \
    y1r OP0 tp0i * src1i;                  \
                                           \
    y0i OP1 tp0r * src0i;                  \
    y0i OP2 tp0i * src0r;                  \
                                           \
    y1i OP1 tp0r * src1i;                  \
    y1i OP2 tp0i * src1r;
|
||||
|
||||
/*
 * Update two rows of y (y0r/y0i) with one matrix column pa0 at offset k,
 * weighted by tp0r/tp0i.
 */
#define ZGEMV_N_2x1()                      \
    LD_DP2(pa0 + k, 2, t0, t1);            \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);   \
                                           \
    /* real part */                        \
    y0r  += tp0r * src0r;                  \
    y0r OP0 tp0i * src0i;                  \
                                           \
    /* imaginary part */                   \
    y0i OP1 tp0r * src0i;                  \
    y0i OP2 tp0i * src0r;
|
||||
|
||||
/*
 * Scalar update of one row of y with one matrix column pa0 at offset k,
 * weighted by temp0_r/temp0_i.  y[0]/y[1] are read into res0/res1, updated,
 * and written back.
 */
#define ZGEMV_N_1x1()                 \
    res0 = y[0];                      \
    res1 = y[1];                      \
                                      \
    /* real part */                   \
    res0  += temp0_r * pa0[k];        \
    res0 OP0 temp0_i * pa0[k + 1];    \
                                      \
    /* imaginary part */              \
    res1 OP1 temp0_r * pa0[k + 1];    \
    res1 OP2 temp0_i * pa0[k];        \
                                      \
    y[0] = res0;                      \
    y[1] = res1;
|
||||
|
||||
/*
 * Unit-stride load of four elements of x, scaled by alpha (alphar/alphai).
 * The scaled pairs land in tp4 (elements 0, 1) and tp5 (elements 2, 3);
 * SPLATI_D2_DP then fills tp0..tp3 (r/i) from the individual lanes.
 * NOTE(review): SPLATI_D2_DP presumably broadcasts lane 0 and lane 1 of its
 * first argument into the two outputs - confirm against macros_msa.h.
 */
#define ZLOAD_X4_SCALE_VECTOR()           \
    LD_DP4(x, 2, x0, x1, x2, x3);         \
                                          \
    /* elements 0 and 1 */                \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);      \
    tp4r   = alphar * x0r;                \
    tp4r OP3 alphai * x0i;                \
    tp4i   = alphar * x0i;                \
    tp4i OP4 alphai * x0r;                \
                                          \
    /* elements 2 and 3 */                \
    PCKEVOD_D2_DP(x3, x2, x1r, x1i);      \
    tp5r   = alphar * x1r;                \
    tp5r OP3 alphai * x1i;                \
    tp5i   = alphar * x1i;                \
    tp5i OP4 alphai * x1r;                \
                                          \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);       \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);       \
    SPLATI_D2_DP(tp5r, tp2r, tp3r);       \
    SPLATI_D2_DP(tp5i, tp2i, tp3i);
|
||||
|
||||
/*
 * Unit-stride load of two elements of x, scaled by alpha (alphar/alphai).
 * The scaled pair lands in tp4r/tp4i; SPLATI_D2_DP then fills tp0 and tp1
 * (r/i) from its lanes.
 */
#define ZLOAD_X2_SCALE_VECTOR()           \
    LD_DP2(x, 2, x0, x1);                 \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);      \
                                          \
    tp4r   = alphar * x0r;                \
    tp4r OP3 alphai * x0i;                \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);       \
                                          \
    tp4i   = alphar * x0i;                \
    tp4i OP4 alphai * x0r;                \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);
|
||||
|
||||
/*
 * Strided load of four elements of x (stride inc_x2 FLOATs), scaled by alpha
 * (alphar/alphai).  Real and imaginary parts are gathered lane by lane into
 * x0r/x0i (elements 0, 1) and x1r/x1i (elements 2, 3); the scaled pairs land
 * in tp4/tp5 and SPLATI_D2_DP fills tp0..tp3 (r/i) from their lanes.
 *
 * The first insert of each gather needs some vector as its base.  Both lanes
 * are overwritten, so its contents do not matter, but it must be an
 * initialized object: alphar is used because it is already set wherever this
 * macro can run.  (tp0r, used previously, is still unassigned the first time
 * this macro executes.)
 */
#define ZLOAD_X4_SCALE_GP()                                                                  \
    x0r = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 0 * inc_x2)));      \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,    1, *((long long *)(x + 1 * inc_x2)));      \
    x1r = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 2 * inc_x2)));      \
    x1r = (v2f64) __msa_insert_d((v2i64) x1r,    1, *((long long *)(x + 3 * inc_x2)));      \
    x0i = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 0 * inc_x2 + 1)));  \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,    1, *((long long *)(x + 1 * inc_x2 + 1)));  \
    x1i = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 2 * inc_x2 + 1)));  \
    x1i = (v2f64) __msa_insert_d((v2i64) x1i,    1, *((long long *)(x + 3 * inc_x2 + 1)));  \
                                                                                             \
    tp4r   = alphar * x0r;                                                                   \
    tp4r OP3 alphai * x0i;                                                                   \
    tp4i   = alphar * x0i;                                                                   \
    tp4i OP4 alphai * x0r;                                                                   \
                                                                                             \
    tp5r   = alphar * x1r;                                                                   \
    tp5r OP3 alphai * x1i;                                                                   \
    tp5i   = alphar * x1i;                                                                   \
    tp5i OP4 alphai * x1r;                                                                   \
                                                                                             \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                          \
    SPLATI_D2_DP(tp5r, tp2r, tp3r);                                                          \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);                                                          \
    SPLATI_D2_DP(tp5i, tp2i, tp3i);
|
||||
|
||||
/*
 * Strided load of two elements of x (stride inc_x2 FLOATs), scaled by alpha
 * (alphar/alphai).  Real and imaginary parts are gathered lane by lane into
 * x0r/x0i; the scaled pair lands in tp4r/tp4i and SPLATI_D2_DP fills tp0 and
 * tp1 (r/i) from its lanes.
 *
 * The first insert of each gather uses alphar as its base: both lanes are
 * overwritten, so the base contents do not matter, but it must be an
 * initialized object.  (tp0r, used previously, can still be unassigned when
 * this macro executes.)
 */
#define ZLOAD_X2_SCALE_GP()                                                                  \
    x0r = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 0 * inc_x2)));      \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,    1, *((long long *)(x + 1 * inc_x2)));      \
    x0i = (v2f64) __msa_insert_d((v2i64) alphar, 0, *((long long *)(x + 0 * inc_x2 + 1)));  \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,    1, *((long long *)(x + 1 * inc_x2 + 1)));  \
                                                                                             \
    tp4r   = alphar * x0r;                                                                   \
    tp4r OP3 alphai * x0i;                                                                   \
    tp4i   = alphar * x0i;                                                                   \
    tp4i OP4 alphai * x0r;                                                                   \
                                                                                             \
    SPLATI_D2_DP(tp4r, tp0r, tp1r);                                                          \
    SPLATI_D2_DP(tp4i, tp0i, tp1i);
|
||||
|
||||
/*
 * Scalar load of one element of x (x[0], x[1]) scaled by alpha_r/alpha_i.
 * The result is kept both as scalars (temp0_r/temp0_i) and as vectors
 * (tp0r/tp0i) built with COPY_DOUBLE_TO_VECTOR.
 */
#define ZLOAD_X1_SCALE_GP()                           \
    temp0_r   = alpha_r * x[0];                       \
    temp0_r OP3 alpha_i * x[1];                       \
    tp0r = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_r);    \
                                                      \
    temp0_i   = alpha_r * x[1];                       \
    temp0_i OP4 alpha_i * x[0];                       \
    tp0i = (v2f64) COPY_DOUBLE_TO_VECTOR(temp0_i);
|
||||
|
||||
/*
 * Unit-stride load of four elements of y, split into y0r/y0i (elements 0, 1)
 * and y1r/y1i (elements 2, 3).
 */
#define ZLOAD_Y4_VECTOR()                 \
    LD_DP4(y, 2, y0, y1, y2, y3);         \
    PCKEVOD_D2_DP(y3, y2, y1r, y1i);      \
    PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
/* Unit-stride load of two elements of y, split into y0r/y0i. */
#define ZLOAD_Y2_VECTOR()                 \
    LD_DP2(y, 2, y0, y1);                 \
    PCKEVOD_D2_DP(y1, y0, y0r, y0i);
|
||||
|
||||
/*
 * Unit-stride store of four elements of y: y0r/y0i and y1r/y1i are
 * recombined into y0..y3 with ILVRL_D2_DP and written back.
 */
#define ZSTORE_Y4_VECTOR()                \
    ILVRL_D2_DP(y1i, y1r, y2, y3);        \
    ILVRL_D2_DP(y0i, y0r, y0, y1);        \
    ST_DP4(y0, y1, y2, y3, y, 2);
|
||||
|
||||
/*
 * Unit-stride store of two elements of y: y0r/y0i are recombined into y0, y1
 * with ILVRL_D2_DP and written back.
 */
#define ZSTORE_Y2_VECTOR()                \
    ILVRL_D2_DP(y0i, y0r, y0, y1);        \
    ST_DP2(y0, y1, y, 2);
|
||||
|
||||
/*
 * Strided load of four elements of y (stride inc_y2 FLOATs).  Real parts go
 * to y0r (elements 0, 1) and y1r (elements 2, 3), imaginary parts to y0i and
 * y1i.  tp0r only serves as the base vector of the first insert; both lanes
 * are overwritten.
 */
#define ZLOAD_Y4_GP()                                                                      \
    /* elements 0 and 1 */                                                                 \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y + 1 * inc_y2)));      \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y + 1 * inc_y2 + 1)));  \
                                                                                           \
    /* elements 2 and 3 */                                                                 \
    y1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 2 * inc_y2)));      \
    y1r = (v2f64) __msa_insert_d((v2i64) y1r,  1, *((long long *)(y + 3 * inc_y2)));      \
    y1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 2 * inc_y2 + 1)));  \
    y1i = (v2f64) __msa_insert_d((v2i64) y1i,  1, *((long long *)(y + 3 * inc_y2 + 1)));
|
||||
|
||||
/*
 * Strided load of two elements of y (stride inc_y2 FLOATs): real parts into
 * y0r, imaginary parts into y0i.  tp0r only serves as the base vector of the
 * first insert; both lanes are overwritten.
 */
#define ZLOAD_Y2_GP()                                                                      \
    /* real parts */                                                                       \
    y0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2)));      \
    y0r = (v2f64) __msa_insert_d((v2i64) y0r,  1, *((long long *)(y + 1 * inc_y2)));      \
                                                                                           \
    /* imaginary parts */                                                                  \
    y0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *)(y + 0 * inc_y2 + 1)));  \
    y0i = (v2f64) __msa_insert_d((v2i64) y0i,  1, *((long long *)(y + 1 * inc_y2 + 1)));
|
||||
|
||||
/*
 * Strided store of four elements of y (stride inc_y2 FLOATs) from y0r/y0i
 * (elements 0, 1) and y1r/y1i (elements 2, 3), written one element at a
 * time.  Real-part stores and imaginary-part stores each stay in ascending
 * element order.
 */
#define ZSTORE_Y4_GP()                                                        \
    *((long long *)(y + 0 * inc_y2))     = __msa_copy_s_d((v2i64) y0r, 0);    \
    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);    \
                                                                              \
    *((long long *)(y + 1 * inc_y2))     = __msa_copy_s_d((v2i64) y0r, 1);    \
    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);    \
                                                                              \
    *((long long *)(y + 2 * inc_y2))     = __msa_copy_s_d((v2i64) y1r, 0);    \
    *((long long *)(y + 2 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 0);    \
                                                                              \
    *((long long *)(y + 3 * inc_y2))     = __msa_copy_s_d((v2i64) y1r, 1);    \
    *((long long *)(y + 3 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y1i, 1);
|
||||
|
||||
/*
 * Strided store of two elements of y (stride inc_y2 FLOATs) from y0r/y0i,
 * written one element at a time.
 */
#define ZSTORE_Y2_GP()                                                        \
    *((long long *)(y + 0 * inc_y2))     = __msa_copy_s_d((v2i64) y0r, 0);    \
    *((long long *)(y + 0 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 0);    \
                                                                              \
    *((long long *)(y + 1 * inc_y2))     = __msa_copy_s_d((v2i64) y0r, 1);    \
    *((long long *)(y + 1 * inc_y2 + 1)) = __msa_copy_s_d((v2i64) y0i, 1);
|
||||
|
||||
/*
 * Driver for the non-transposed kernel.  Columns are consumed four at a time
 * (j loop), then a block of two (n & 2), then a single column (n & 1).  For
 * each column block the scaled x values are loaded once, then y is walked
 * from y_org in row blocks of four, two and one.
 *
 * The ZLOAD_X*_SCALE, ZLOAD_Y* and ZSTORE_Y* names must be #defined by the
 * caller to the VECTOR or GP variants before this macro is expanded.
 * k is the FLOAT offset of the current row inside each column pointer.
 */
#define ZGEMV_N_MSA()                       \
    /* blocks of four columns */            \
    for (j = (n >> 2); j--;)                \
    {                                       \
        ZLOAD_X4_SCALE();                   \
                                            \
        k = 0;                              \
        y = y_org;                          \
                                            \
        for (i = (m >> 2); i--;)            \
        {                                   \
            ZLOAD_Y4();                     \
            ZGEMV_N_4x4();                  \
            ZSTORE_Y4();                    \
                                            \
            k += 8;                         \
            y += 4 * inc_y2;                \
        }                                   \
                                            \
        if (m & 2)                          \
        {                                   \
            ZLOAD_Y2();                     \
            ZGEMV_N_2x4();                  \
            ZSTORE_Y2();                    \
                                            \
            k += 4;                         \
            y += 2 * inc_y2;                \
        }                                   \
                                            \
        if (m & 1)                          \
        {                                   \
            /* scalar copies of the four scaled x values */ \
            temp0_r = tp4r[0];              \
            temp0_i = tp4i[0];              \
            temp1_r = tp4r[1];              \
            temp1_i = tp4i[1];              \
            temp2_r = tp5r[0];              \
            temp2_i = tp5i[0];              \
            temp3_r = tp5r[1];              \
            temp3_i = tp5i[1];              \
                                            \
            ZGEMV_N_1x4();                  \
                                            \
            k += 2;                         \
            y += inc_y2;                    \
        }                                   \
                                            \
        pa0 += 4 * lda2;                    \
        pa1 += 4 * lda2;                    \
        pa2 += 4 * lda2;                    \
        pa3 += 4 * lda2;                    \
        x   += 4 * inc_x2;                  \
    }                                       \
                                            \
    /* one block of two columns */          \
    if (n & 2)                              \
    {                                       \
        ZLOAD_X2_SCALE();                   \
                                            \
        k = 0;                              \
        y = y_org;                          \
                                            \
        for (i = (m >> 2); i--;)            \
        {                                   \
            ZLOAD_Y4();                     \
            ZGEMV_N_4x2();                  \
            ZSTORE_Y4();                    \
                                            \
            k += 8;                         \
            y += 4 * inc_y2;                \
        }                                   \
                                            \
        if (m & 2)                          \
        {                                   \
            ZLOAD_Y2();                     \
            ZGEMV_N_2x2();                  \
            ZSTORE_Y2();                    \
                                            \
            k += 4;                         \
            y += 2 * inc_y2;                \
        }                                   \
                                            \
        if (m & 1)                          \
        {                                   \
            /* scalar copies of the two scaled x values */ \
            temp0_r = tp4r[0];              \
            temp0_i = tp4i[0];              \
            temp1_r = tp4r[1];              \
            temp1_i = tp4i[1];              \
                                            \
            ZGEMV_N_1x2();                  \
                                            \
            k += 2;                         \
            y += inc_y2;                    \
        }                                   \
                                            \
        pa0 += 2 * lda2;                    \
        pa1 += 2 * lda2;                    \
        x   += 2 * inc_x2;                  \
    }                                       \
                                            \
    /* last single column */                \
    if (n & 1)                              \
    {                                       \
        ZLOAD_X1_SCALE();                   \
                                            \
        k = 0;                              \
        y = y_org;                          \
                                            \
        for (i = (m >> 2); i--;)            \
        {                                   \
            ZLOAD_Y4();                     \
            ZGEMV_N_4x1();                  \
            ZSTORE_Y4();                    \
                                            \
            k += 8;                         \
            y += 4 * inc_y2;                \
        }                                   \
                                            \
        if (m & 2)                          \
        {                                   \
            ZLOAD_Y2();                     \
            ZGEMV_N_2x1();                  \
            ZSTORE_Y2();                    \
                                            \
            k += 4;                         \
            y += 2 * inc_y2;                \
        }                                   \
                                            \
        if (m & 1)                          \
        {                                   \
            ZGEMV_N_1x1();                  \
                                            \
            k += 2;                         \
            y += inc_y2;                    \
        }                                   \
                                            \
        pa0 += lda2;                        \
        x   += inc_x2;                      \
    }
|
||||
|
||||
/*
 * Non-transposed complex double-precision GEMV kernel: accumulates the
 * alpha-scaled columns of A, weighted by x, into y.
 *
 * m, n              row and column counts walked by ZGEMV_N_MSA
 * dummy1            not referenced in this function
 * alpha_r, alpha_i  real and imaginary parts of alpha
 * A                 matrix; consecutive columns are lda2 elements apart
 * lda2, inc_x2,     strides as passed by the caller; each is doubled on
 * inc_y2            entry so that it indexes FLOATs from then on
 * x, y              input and in/out vectors, interleaved (re, im) FLOATs
 * buffer            not referenced in this function
 *
 * Always returns 0.
 *
 * The body of the computation is ZGEMV_N_MSA, expanded once per stride
 * combination so that unit-stride x and/or y use the vector load/store
 * macros and everything else uses the element-wise (GP) ones.  A single
 * x element is always loaded with the GP variant.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
          FLOAT *A, BLASLONG lda2, FLOAT *x, BLASLONG inc_x2, FLOAT *y,
          BLASLONG inc_y2, FLOAT *buffer)
{
    BLASLONG i, j, k;
    FLOAT *y_org = y;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT temp0_r, temp1_r, temp2_r, temp3_r, temp0_i, temp1_i, temp2_i;
    FLOAT temp3_i, res0, res1;
    v2f64 alphar, alphai;
    v2f64 x0, x1, x2, x3, y0, y1, y2, y3;
    v2f64 x0r, x1r, x0i, x1i, y0r, y1r, y0i, y1i;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v2f64 tp0r, tp1r, tp2r, tp3r, tp4r, tp5r, tp0i, tp1i, tp2i, tp3i, tp4i, tp5i;

    /* Convert the strides to FLOAT units. */
    lda2 = 2 * lda2;
    inc_x2 = 2 * inc_x2;
    inc_y2 = 2 * inc_y2;

    /* Pointers to four consecutive columns of A. */
    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    alphar = COPY_DOUBLE_TO_VECTOR(alpha_r);
    alphai = COPY_DOUBLE_TO_VECTOR(alpha_i);

    if (2 == inc_x2)
    {
        if (2 == inc_y2)
        {
            /* x and y both unit stride */
            #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
            #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
            #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
            #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
            #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
            #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
            #define ZSTORE_Y2       ZSTORE_Y2_VECTOR

            ZGEMV_N_MSA();

            #undef ZLOAD_X4_SCALE
            #undef ZLOAD_X2_SCALE
            #undef ZLOAD_X1_SCALE
            #undef ZLOAD_Y4
            #undef ZLOAD_Y2
            #undef ZSTORE_Y4
            #undef ZSTORE_Y2
        }
        else
        {
            /* x unit stride, y strided */
            #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_VECTOR
            #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_VECTOR
            #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
            #define ZLOAD_Y4        ZLOAD_Y4_GP
            #define ZLOAD_Y2        ZLOAD_Y2_GP
            #define ZSTORE_Y4       ZSTORE_Y4_GP
            #define ZSTORE_Y2       ZSTORE_Y2_GP

            ZGEMV_N_MSA();

            #undef ZLOAD_X4_SCALE
            #undef ZLOAD_X2_SCALE
            #undef ZLOAD_X1_SCALE
            #undef ZLOAD_Y4
            #undef ZLOAD_Y2
            #undef ZSTORE_Y4
            #undef ZSTORE_Y2
        }
    }
    else if (2 == inc_y2)
    {
        /* x strided, y unit stride */
        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
        #define ZLOAD_Y4        ZLOAD_Y4_VECTOR
        #define ZLOAD_Y2        ZLOAD_Y2_VECTOR
        #define ZSTORE_Y4       ZSTORE_Y4_VECTOR
        #define ZSTORE_Y2       ZSTORE_Y2_VECTOR

        ZGEMV_N_MSA();

        #undef ZLOAD_X4_SCALE
        #undef ZLOAD_X2_SCALE
        #undef ZLOAD_X1_SCALE
        #undef ZLOAD_Y4
        #undef ZLOAD_Y2
        #undef ZSTORE_Y4
        #undef ZSTORE_Y2
    }
    else
    {
        /* x and y both strided */
        #define ZLOAD_X4_SCALE  ZLOAD_X4_SCALE_GP
        #define ZLOAD_X2_SCALE  ZLOAD_X2_SCALE_GP
        #define ZLOAD_X1_SCALE  ZLOAD_X1_SCALE_GP
        #define ZLOAD_Y4        ZLOAD_Y4_GP
        #define ZLOAD_Y2        ZLOAD_Y2_GP
        #define ZSTORE_Y4       ZSTORE_Y4_GP
        #define ZSTORE_Y2       ZSTORE_Y2_GP

        ZGEMV_N_MSA();

        #undef ZLOAD_X4_SCALE
        #undef ZLOAD_X2_SCALE
        #undef ZLOAD_X1_SCALE
        #undef ZLOAD_Y4
        #undef ZLOAD_Y2
        #undef ZSTORE_Y4
        #undef ZSTORE_Y2
    }

    return (0);
}
|
||||
|
||||
#undef OP0
|
||||
#undef OP1
|
||||
#undef OP2
|
||||
#undef OP3
|
||||
#undef OP4
|
|
@ -0,0 +1,544 @@
|
|||
/*******************************************************************************
|
||||
Copyright (c) 2016, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*******************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include "macros_msa.h"
|
||||
|
||||
#undef OP0
#undef OP1
#undef OP2
#undef OP3
#undef OP4

/*
 * Sign operators for the ZGEMV_T accumulation
 *     tp_r  += a_r * x_r;     tp_r OP0 a_i * x_i;
 *     tp_i OP1 a_r * x_i;     tp_i OP2 a_i * x_r;
 * When CONJ and XCONJ are both defined or both undefined this is a * x;
 * when exactly one of them is defined it is conj(a) * x.
 */
#if defined(CONJ) == defined(XCONJ)
    #define OP0  -=
    #define OP1  +=
    #define OP2  +=
#else
    #define OP0  +=
    #define OP1  +=
    #define OP2  -=
#endif
|
||||
|
||||
/*
 * Accumulate four rows (x0r/x0i, x1r/x1i) of four matrix columns pa0..pa3 at
 * offset k into the per-column accumulators tp0..tp3 (r/i parts).  Each
 * column is loaded, de-interleaved and accumulated before moving on to the
 * next one.
 */
#define ZGEMV_T_4x4()                          \
    /* column 0 -> tp0 */                      \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);        \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);       \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);       \
    tp0r  += src0r * x0r;                      \
    tp0r  += src1r * x1r;                      \
    tp0r OP0 src0i * x0i;                      \
    tp0r OP0 src1i * x1i;                      \
    tp0i OP1 src0r * x0i;                      \
    tp0i OP1 src1r * x1i;                      \
    tp0i OP2 src0i * x0r;                      \
    tp0i OP2 src1i * x1r;                      \
                                               \
    /* column 1 -> tp1 */                      \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);        \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);       \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);       \
    tp1r  += src2r * x0r;                      \
    tp1r  += src3r * x1r;                      \
    tp1r OP0 src2i * x0i;                      \
    tp1r OP0 src3i * x1i;                      \
    tp1i OP1 src2r * x0i;                      \
    tp1i OP1 src3r * x1i;                      \
    tp1i OP2 src2i * x0r;                      \
    tp1i OP2 src3i * x1r;                      \
                                               \
    /* column 2 -> tp2 */                      \
    LD_DP4(pa2 + k, 2, t8, t9, t10, t11);      \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);       \
    PCKEVOD_D2_DP(t11, t10, src5r, src5i);     \
    tp2r  += src4r * x0r;                      \
    tp2r  += src5r * x1r;                      \
    tp2r OP0 src4i * x0i;                      \
    tp2r OP0 src5i * x1i;                      \
    tp2i OP1 src4r * x0i;                      \
    tp2i OP1 src5r * x1i;                      \
    tp2i OP2 src4i * x0r;                      \
    tp2i OP2 src5i * x1r;                      \
                                               \
    /* column 3 -> tp3 */                      \
    LD_DP4(pa3 + k, 2, t12, t13, t14, t15);    \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);     \
    PCKEVOD_D2_DP(t15, t14, src7r, src7i);     \
    tp3r  += src6r * x0r;                      \
    tp3r  += src7r * x1r;                      \
    tp3r OP0 src6i * x0i;                      \
    tp3r OP0 src7i * x1i;                      \
    tp3i OP1 src6r * x0i;                      \
    tp3i OP1 src7r * x1i;                      \
    tp3i OP2 src6i * x0r;                      \
    tp3i OP2 src7i * x1r;
|
||||
|
||||
/*
 * One accumulation step covering two matrix columns (pa0, pa1).
 *
 * Four vectors are loaded from each column at offset k, split pairwise by
 * PCKEVOD_D2_DP into srcNr / srcNi (presumably real / imaginary lanes;
 * the helper is defined outside this chunk), and multiplied with
 * x0r/x0i and x1r/x1i into tp0r/tp0i and tp1r/tp1i.  OP0/OP1/OP2 select
 * the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_4x2();" is one statement.
 */
#define ZGEMV_T_4x2()                            \
do {                                             \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);          \
    LD_DP4(pa1 + k, 2, t4, t5, t6, t7);          \
                                                 \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);         \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);         \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);         \
    PCKEVOD_D2_DP(t7, t6, src3r, src3i);         \
                                                 \
    tp0r += src0r * x0r;                         \
    tp0r += src1r * x1r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0r OP0 src1i * x1i;                        \
                                                 \
    tp1r += src2r * x0r;                         \
    tp1r += src3r * x1r;                         \
    tp1r OP0 src2i * x0i;                        \
    tp1r OP0 src3i * x1i;                        \
                                                 \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP1 src1r * x1i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp0i OP2 src1i * x1r;                        \
                                                 \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP1 src3r * x1i;                        \
    tp1i OP2 src2i * x0r;                        \
    tp1i OP2 src3i * x1r;                        \
} while (0)
|
||||
|
||||
/*
 * One accumulation step covering a single matrix column (pa0).
 *
 * Four vectors are loaded from pa0 + k, split pairwise by PCKEVOD_D2_DP
 * into src0r/src0i and src1r/src1i (presumably real / imaginary lanes;
 * the helper is defined outside this chunk), and multiplied with
 * x0r/x0i and x1r/x1i into tp0r/tp0i.  OP0/OP1/OP2 select the sign of
 * the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_4x1();" is one statement.
 */
#define ZGEMV_T_4x1()                            \
do {                                             \
    LD_DP4(pa0 + k, 2, t0, t1, t2, t3);          \
                                                 \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);         \
    PCKEVOD_D2_DP(t3, t2, src1r, src1i);         \
                                                 \
    tp0r += src0r * x0r;                         \
    tp0r += src1r * x1r;                         \
    tp0r OP0 src0i * x0i;                        \
    tp0r OP0 src1i * x1i;                        \
                                                 \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP1 src1r * x1i;                        \
    tp0i OP2 src0i * x0r;                        \
    tp0i OP2 src1i * x1r;                        \
} while (0)
|
||||
|
||||
/*
 * Half-width accumulation step covering four matrix columns (pa0..pa3).
 *
 * Two vectors are loaded from each column at offset k and split by
 * PCKEVOD_D2_DP into src0/src2/src4/src6 (r and i; presumably real /
 * imaginary lanes, helper defined outside this chunk).  Only x0r/x0i are
 * used, and the products are added into tp0..tp3 (r and i) with
 * OP0/OP1/OP2 selecting the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_2x4();" is one statement.
 */
#define ZGEMV_T_2x4()                            \
do {                                             \
    LD_DP2(pa0 + k, 2, t0, t1);                  \
    LD_DP2(pa1 + k, 2, t4, t5);                  \
    LD_DP2(pa2 + k, 2, t8, t9);                  \
    LD_DP2(pa3 + k, 2, t12, t13);                \
                                                 \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);         \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);         \
    PCKEVOD_D2_DP(t9, t8, src4r, src4i);         \
    PCKEVOD_D2_DP(t13, t12, src6r, src6i);       \
                                                 \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
                                                 \
    tp1r += src2r * x0r;                         \
    tp1r OP0 src2i * x0i;                        \
                                                 \
    tp2r += src4r * x0r;                         \
    tp2r OP0 src4i * x0i;                        \
                                                 \
    tp3r += src6r * x0r;                         \
    tp3r OP0 src6i * x0i;                        \
                                                 \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \
                                                 \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP2 src2i * x0r;                        \
                                                 \
    tp2i OP1 src4r * x0i;                        \
    tp2i OP2 src4i * x0r;                        \
                                                 \
    tp3i OP1 src6r * x0i;                        \
    tp3i OP2 src6i * x0r;                        \
} while (0)
|
||||
|
||||
/*
 * Half-width accumulation step covering two matrix columns (pa0, pa1).
 *
 * Two vectors are loaded from each column at offset k and split by
 * PCKEVOD_D2_DP into src0r/src0i and src2r/src2i (presumably real /
 * imaginary lanes; helper defined outside this chunk).  The products
 * with x0r/x0i are added into tp0r/tp0i and tp1r/tp1i, with OP0/OP1/OP2
 * selecting the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_2x2();" is one statement.
 */
#define ZGEMV_T_2x2()                            \
do {                                             \
    LD_DP2(pa0 + k, 2, t0, t1);                  \
    LD_DP2(pa1 + k, 2, t4, t5);                  \
                                                 \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);         \
    PCKEVOD_D2_DP(t5, t4, src2r, src2i);         \
                                                 \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
                                                 \
    tp1r += src2r * x0r;                         \
    tp1r OP0 src2i * x0i;                        \
                                                 \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \
                                                 \
    tp1i OP1 src2r * x0i;                        \
    tp1i OP2 src2i * x0r;                        \
} while (0)
|
||||
|
||||
/*
 * Half-width accumulation step covering a single matrix column (pa0).
 *
 * Two vectors are loaded from pa0 + k and split by PCKEVOD_D2_DP into
 * src0r/src0i (presumably real / imaginary lanes; helper defined outside
 * this chunk).  The products with x0r/x0i are added into tp0r/tp0i, with
 * OP0/OP1/OP2 selecting the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_2x1();" is one statement.
 */
#define ZGEMV_T_2x1()                            \
do {                                             \
    LD_DP2(pa0 + k, 2, t0, t1);                  \
                                                 \
    PCKEVOD_D2_DP(t1, t0, src0r, src0i);         \
                                                 \
    tp0r += src0r * x0r;                         \
    tp0r OP0 src0i * x0i;                        \
                                                 \
    tp0i OP1 src0r * x0i;                        \
    tp0i OP2 src0i * x0r;                        \
} while (0)
|
||||
|
||||
/*
 * Scalar tail step: one interleaved (re, im) element from each of the
 * four columns pa0..pa3 at offset k is multiplied with the element at
 * x[0], x[1] and added into temp0..temp3 (r and i).
 *
 * Every x index is written as "0 * inc_x2 (+ 1)", so only x[0] and x[1]
 * are read regardless of inc_x2; the caller advances x between steps.
 * OP0/OP1/OP2 select the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_1x4();" is one statement.
 */
#define ZGEMV_T_1x4()                                \
do {                                                 \
    temp0r += pa0[k + 0] * x[0 * inc_x2];            \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];       \
    temp1r += pa1[k + 0] * x[0 * inc_x2];            \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];       \
    temp2r += pa2[k + 0] * x[0 * inc_x2];            \
    temp2r OP0 pa2[k + 1] * x[0 * inc_x2 + 1];       \
    temp3r += pa3[k + 0] * x[0 * inc_x2];            \
    temp3r OP0 pa3[k + 1] * x[0 * inc_x2 + 1];       \
                                                     \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];       \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];           \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];       \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];           \
    temp2i OP1 pa2[k + 0] * x[0 * inc_x2 + 1];       \
    temp2i OP2 pa2[k + 1] * x[0 * inc_x2];           \
    temp3i OP1 pa3[k + 0] * x[0 * inc_x2 + 1];       \
    temp3i OP2 pa3[k + 1] * x[0 * inc_x2];           \
} while (0)
|
||||
|
||||
/*
 * Scalar tail step: one interleaved (re, im) element from each of the
 * two columns pa0, pa1 at offset k is multiplied with the element at
 * x[0], x[1] and added into temp0r/temp0i and temp1r/temp1i.
 *
 * Every x index is written as "0 * inc_x2 (+ 1)", so only x[0] and x[1]
 * are read regardless of inc_x2.  OP0/OP1/OP2 select the sign of the
 * cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_1x2();" is one statement.
 */
#define ZGEMV_T_1x2()                                \
do {                                                 \
    temp0r += pa0[k + 0] * x[0 * inc_x2];            \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];       \
    temp1r += pa1[k + 0] * x[0 * inc_x2];            \
    temp1r OP0 pa1[k + 1] * x[0 * inc_x2 + 1];       \
                                                     \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];       \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];           \
    temp1i OP1 pa1[k + 0] * x[0 * inc_x2 + 1];       \
    temp1i OP2 pa1[k + 1] * x[0 * inc_x2];           \
} while (0)
|
||||
|
||||
/*
 * Scalar tail step: the interleaved (re, im) element of column pa0 at
 * offset k is multiplied with the element at x[0], x[1] and added into
 * temp0r/temp0i.
 *
 * Every x index is written as "0 * inc_x2 (+ 1)", so only x[0] and x[1]
 * are read regardless of inc_x2.  OP0/OP1/OP2 select the sign of the
 * cross terms.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_1x1();" is one statement.
 */
#define ZGEMV_T_1x1()                                \
do {                                                 \
    temp0r += pa0[k + 0] * x[0 * inc_x2];            \
    temp0r OP0 pa0[k + 1] * x[0 * inc_x2 + 1];       \
                                                     \
    temp0i OP1 pa0[k + 0] * x[0 * inc_x2 + 1];       \
    temp0i OP2 pa0[k + 1] * x[0 * inc_x2];           \
} while (0)
|
||||
|
||||
/*
 * Scale four accumulated results by (alphar, alphai) and add them into
 * four consecutive interleaved elements of y, spaced inc_y2 apart.
 *
 * For each n in 0..3: the pair y[n * inc_y2], y[n * inc_y2 + 1] is read
 * into resNr/resNi, updated from tempNr/tempNi, and written back.
 * OP0/OP1/OP2 select the sign of the cross terms, using the same
 * selectors as the multiply-accumulate macros.
 *
 * Wrapped in do { } while (0) so that "ZSCALE_STORE_Y4_GP();" is one
 * statement.
 */
#define ZSCALE_STORE_Y4_GP()             \
do {                                     \
    res0r = y[0 * inc_y2];               \
    res1r = y[1 * inc_y2];               \
    res2r = y[2 * inc_y2];               \
    res3r = y[3 * inc_y2];               \
                                         \
    res0i = y[0 * inc_y2 + 1];           \
    res1i = y[1 * inc_y2 + 1];           \
    res2i = y[2 * inc_y2 + 1];           \
    res3i = y[3 * inc_y2 + 1];           \
                                         \
    res0r += alphar * temp0r;            \
    res0r OP0 alphai * temp0i;           \
    res1r += alphar * temp1r;            \
    res1r OP0 alphai * temp1i;           \
    res2r += alphar * temp2r;            \
    res2r OP0 alphai * temp2i;           \
    res3r += alphar * temp3r;            \
    res3r OP0 alphai * temp3i;           \
                                         \
    res0i OP1 alphar * temp0i;           \
    res0i OP2 alphai * temp0r;           \
    res1i OP1 alphar * temp1i;           \
    res1i OP2 alphai * temp1r;           \
    res2i OP1 alphar * temp2i;           \
    res2i OP2 alphai * temp2r;           \
    res3i OP1 alphar * temp3i;           \
    res3i OP2 alphai * temp3r;           \
                                         \
    y[0 * inc_y2] = res0r;               \
    y[1 * inc_y2] = res1r;               \
    y[2 * inc_y2] = res2r;               \
    y[3 * inc_y2] = res3r;               \
                                         \
    y[0 * inc_y2 + 1] = res0i;           \
    y[1 * inc_y2 + 1] = res1i;           \
    y[2 * inc_y2 + 1] = res2i;           \
    y[3 * inc_y2 + 1] = res3i;           \
} while (0)
|
||||
|
||||
/*
 * Scale two accumulated results by (alphar, alphai) and add them into
 * two consecutive interleaved elements of y, spaced inc_y2 apart.
 *
 * For n in 0..1: the pair y[n * inc_y2], y[n * inc_y2 + 1] is read into
 * resNr/resNi, updated from tempNr/tempNi, and written back.
 * OP0/OP1/OP2 select the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZSCALE_STORE_Y2_GP();" is one
 * statement.
 */
#define ZSCALE_STORE_Y2_GP()             \
do {                                     \
    res0r = y[0 * inc_y2];               \
    res1r = y[1 * inc_y2];               \
                                         \
    res0i = y[0 * inc_y2 + 1];           \
    res1i = y[1 * inc_y2 + 1];           \
                                         \
    res0r += alphar * temp0r;            \
    res0r OP0 alphai * temp0i;           \
    res1r += alphar * temp1r;            \
    res1r OP0 alphai * temp1i;           \
                                         \
    res0i OP1 alphar * temp0i;           \
    res0i OP2 alphai * temp0r;           \
    res1i OP1 alphar * temp1i;           \
    res1i OP2 alphai * temp1r;           \
                                         \
    y[0 * inc_y2] = res0r;               \
    y[1 * inc_y2] = res1r;               \
                                         \
    y[0 * inc_y2 + 1] = res0i;           \
    y[1 * inc_y2 + 1] = res1i;           \
} while (0)
|
||||
|
||||
/*
 * Scale one accumulated result by (alphar, alphai) and add it into the
 * interleaved element y[0], y[1].
 *
 * The pair is read into res0r/res0i, updated from temp0r/temp0i, and
 * written back.  OP0/OP1/OP2 select the sign of the cross terms.
 *
 * Wrapped in do { } while (0) so that "ZSCALE_STORE_Y1_GP();" is one
 * statement.
 */
#define ZSCALE_STORE_Y1_GP()             \
do {                                     \
    res0r = y[0 * inc_y2];               \
    res0i = y[0 * inc_y2 + 1];           \
                                         \
    res0r += alphar * temp0r;            \
    res0r OP0 alphai * temp0i;           \
                                         \
    res0i OP1 alphar * temp0i;           \
    res0i OP2 alphai * temp0r;           \
                                         \
    y[0 * inc_y2] = res0r;               \
    y[0 * inc_y2 + 1] = res0i;           \
} while (0)
|
||||
|
||||
/*
 * Contiguous-x loader for the 4-wide step: loads four vectors starting
 * at x into x0..x3 and splits them pairwise into x0r/x0i and x1r/x1i.
 * LD_DP4 and PCKEVOD_D2_DP are defined outside this chunk; presumably
 * the split is real / imaginary lanes (confirm against their header).
 *
 * Wrapped in do { } while (0) so that the expansion is one statement.
 */
#define ZLOAD_X4_VECTOR()                \
do {                                     \
    LD_DP4(x, 2, x0, x1, x2, x3);        \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);     \
    PCKEVOD_D2_DP(x3, x2, x1r, x1i);     \
} while (0)
|
||||
|
||||
/*
 * Contiguous-x loader for the 2-wide step: loads two vectors starting at
 * x into x0, x1 and splits them into x0r/x0i.  LD_DP2 and PCKEVOD_D2_DP
 * are defined outside this chunk; presumably the split is real /
 * imaginary lanes (confirm against their header).
 *
 * Wrapped in do { } while (0) so that the expansion is one statement.
 */
#define ZLOAD_X2_VECTOR()                \
do {                                     \
    LD_DP2(x, 2, x0, x1);                \
    PCKEVOD_D2_DP(x1, x0, x0r, x0i);     \
} while (0)
|
||||
|
||||
/*
 * Strided-x loader for the 4-wide step: gathers four interleaved
 * elements of x, spaced inc_x2 apart, into x0r/x1r (offsets +0) and
 * x0i/x1i (offsets +1) one 64-bit lane at a time.
 *
 * tp0r is only the source operand for each lane-0 insert; lane 1 is
 * overwritten by the insert that follows, so both lanes of every result
 * come from x.
 *
 * NOTE(review): the 64-bit payload is read through a "long long *" cast
 * of the x pointer, i.e. the bit pattern is moved without conversion.
 * This is type punning via pointer cast; it is kept as in the original.
 *
 * Wrapped in do { } while (0) so that the expansion is one statement.
 */
#define ZLOAD_X4_GP()                                                                       \
do {                                                                                        \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));       \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));       \
    x1r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2)));       \
    x1r = (v2f64) __msa_insert_d((v2i64) x1r,  1, *((long long *) (x + 3 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));   \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));   \
    x1i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 2 * inc_x2 + 1)));   \
    x1i = (v2f64) __msa_insert_d((v2i64) x1i,  1, *((long long *) (x + 3 * inc_x2 + 1)));   \
} while (0)
|
||||
|
||||
/*
 * Strided-x loader for the 2-wide step: gathers two interleaved elements
 * of x, spaced inc_x2 apart, into x0r (offsets +0) and x0i (offsets +1)
 * one 64-bit lane at a time.
 *
 * tp0r is only the source operand for each lane-0 insert; lane 1 is
 * overwritten by the insert that follows, so both lanes of every result
 * come from x.
 *
 * NOTE(review): the 64-bit payload is read through a "long long *" cast
 * of the x pointer (type punning via pointer cast); kept as in the
 * original.
 *
 * Wrapped in do { } while (0) so that the expansion is one statement.
 */
#define ZLOAD_X2_GP()                                                                       \
do {                                                                                        \
    x0r = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2)));       \
    x0r = (v2f64) __msa_insert_d((v2i64) x0r,  1, *((long long *) (x + 1 * inc_x2)));       \
    x0i = (v2f64) __msa_insert_d((v2i64) tp0r, 0, *((long long *) (x + 0 * inc_x2 + 1)));   \
    x0i = (v2f64) __msa_insert_d((v2i64) x0i,  1, *((long long *) (x + 1 * inc_x2 + 1)));   \
} while (0)
|
||||
|
||||
/*
 * Main driver, expanded once per x-access variant inside CNAME.
 *
 * Columns are processed in groups of four (n >> 2 times), then an
 * optional group of two (n & 2) and an optional single column (n & 1).
 * Within each group the m rows are consumed four at a time (m >> 2),
 * then two (m & 2) using the vector macros, the two vector lanes are
 * summed into scalar temps, and a final odd row (m & 1) is handled by
 * the scalar ZGEMV_T_1xN macro.  The group's results are then scaled and
 * added into y by ZSCALE_STORE_Y{4,2,1}_GP, and the column pointers and
 * y are advanced.
 *
 * ZLOAD_X4 / ZLOAD_X2 must be #defined by the caller to the loader that
 * matches the x stride before this macro is expanded.
 *
 * Relies on locals of the expanding function: i, j, k, m, n, x,
 * srcx_org, y, pa0..pa3, lda2, inc_x2, inc_y2, zero, tpN{r,i},
 * tempN{r,i}, plus everything the nested macros use.
 *
 * Wrapped in do { } while (0) so that "ZGEMV_T_MSA();" is one statement.
 */
#define ZGEMV_T_MSA()                            \
do {                                             \
    /* groups of four columns */                 \
    for (j = (n >> 2); j--;)                     \
    {                                            \
        tp0r = tp1r = tp2r = tp3r = zero;        \
        tp0i = tp1i = tp2i = tp3i = zero;        \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 2); i--;)                 \
        {                                        \
            ZLOAD_X4();                          \
            ZGEMV_T_4x4();                       \
                                                 \
            k += 2 * 4;                          \
            x += inc_x2 * 4;                     \
        }                                        \
                                                 \
        if (m & 2)                               \
        {                                        \
            ZLOAD_X2();                          \
            ZGEMV_T_2x4();                       \
                                                 \
            k += 2 * 2;                          \
            x += inc_x2 * 2;                     \
        }                                        \
                                                 \
        /* fold the two vector lanes */          \
        temp0r = tp0r[0] + tp0r[1];              \
        temp1r = tp1r[0] + tp1r[1];              \
        temp2r = tp2r[0] + tp2r[1];              \
        temp3r = tp3r[0] + tp3r[1];              \
        temp0i = tp0i[0] + tp0i[1];              \
        temp1i = tp1i[0] + tp1i[1];              \
        temp2i = tp2i[0] + tp2i[1];              \
        temp3i = tp3i[0] + tp3i[1];              \
                                                 \
        if (m & 1)                               \
        {                                        \
            ZGEMV_T_1x4();                       \
                                                 \
            k += 2;                              \
            x += inc_x2;                         \
        }                                        \
                                                 \
        ZSCALE_STORE_Y4_GP();                    \
                                                 \
        pa0 += 4 * lda2;                         \
        pa1 += 4 * lda2;                         \
        pa2 += 4 * lda2;                         \
        pa3 += 4 * lda2;                         \
        y += 4 * inc_y2;                         \
    }                                            \
                                                 \
    /* optional group of two columns */          \
    if (n & 2)                                   \
    {                                            \
        tp0r = tp1r = zero;                      \
        tp0i = tp1i = zero;                      \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 2); i--;)                 \
        {                                        \
            ZLOAD_X4();                          \
            ZGEMV_T_4x2();                       \
                                                 \
            k += 2 * 4;                          \
            x += inc_x2 * 4;                     \
        }                                        \
                                                 \
        if (m & 2)                               \
        {                                        \
            ZLOAD_X2();                          \
            ZGEMV_T_2x2();                       \
                                                 \
            k += 2 * 2;                          \
            x += inc_x2 * 2;                     \
        }                                        \
                                                 \
        temp0r = tp0r[0] + tp0r[1];              \
        temp1r = tp1r[0] + tp1r[1];              \
        temp0i = tp0i[0] + tp0i[1];              \
        temp1i = tp1i[0] + tp1i[1];              \
                                                 \
        if (m & 1)                               \
        {                                        \
            ZGEMV_T_1x2();                       \
                                                 \
            k += 2;                              \
            x += inc_x2;                         \
        }                                        \
                                                 \
        ZSCALE_STORE_Y2_GP();                    \
                                                 \
        pa0 += 2 * lda2;                         \
        pa1 += 2 * lda2;                         \
        y += 2 * inc_y2;                         \
    }                                            \
                                                 \
    /* optional last single column */            \
    if (n & 1)                                   \
    {                                            \
        tp0r = zero;                             \
        tp0i = zero;                             \
                                                 \
        k = 0;                                   \
        x = srcx_org;                            \
                                                 \
        for (i = (m >> 2); i--;)                 \
        {                                        \
            ZLOAD_X4();                          \
            ZGEMV_T_4x1();                       \
                                                 \
            k += 2 * 4;                          \
            x += inc_x2 * 4;                     \
        }                                        \
                                                 \
        if (m & 2)                               \
        {                                        \
            ZLOAD_X2();                          \
            ZGEMV_T_2x1();                       \
                                                 \
            k += 2 * 2;                          \
            x += inc_x2 * 2;                     \
        }                                        \
                                                 \
        temp0r = tp0r[0] + tp0r[1];              \
        temp0i = tp0i[0] + tp0i[1];              \
                                                 \
        if (m & 1)                               \
        {                                        \
            ZGEMV_T_1x1();                       \
                                                 \
            k += 2;                              \
            x += inc_x2;                         \
        }                                        \
                                                 \
        ZSCALE_STORE_Y1_GP();                    \
                                                 \
        pa0 += lda2;                             \
        y += inc_y2;                             \
    }                                            \
} while (0)
|
||||
|
||||
/*
 * Kernel entry point: for each of the n columns of A, accumulates the
 * product of that column (m interleaved re/im elements) with x, scales
 * it by (alphar, alphai) and adds it into the matching element of y.
 * The sign conventions come from OP0/OP1/OP2 (CONJ / XCONJ build flags).
 *
 * m, n      row and column counts walked by ZGEMV_T_MSA
 * dummy1    unused
 * alphar/i  the two components of the scale factor
 * A, lda    matrix base pointer and column stride; the stride is doubled
 *           here because each element occupies two FLOATs
 * x, inc_x  input vector and its element stride (doubled likewise)
 * y, inc_y  output vector, updated in place, and its element stride
 * buffer    unused
 *
 * Always returns 0.
 *
 * NOTE(review): FLOAT is loaded into v2f64 vectors, so this presumably
 * builds only with FLOAT == double; confirm against the build setup.
 */
int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alphar, FLOAT alphai,
          FLOAT *A, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y,
          BLASLONG inc_y, FLOAT *buffer)
{
    BLASLONG i, j, k;
    BLASLONG inc_x2, inc_y2, lda2;
    FLOAT *pa0, *pa1, *pa2, *pa3;
    FLOAT *srcx_org = x;  /* x is rewound to this for every column group */
    FLOAT temp0r, temp0i, temp2r, temp2i, temp1r, temp1i, temp3r, temp3i;
    FLOAT res0r, res0i, res2r, res2i, res1r, res1i, res3r, res3i;
    v2f64 zero = {0};
    v2f64 x0, x1, x2, x3, x0r, x1r, x0i, x1i;
    v2f64 t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, t11, t12, t13, t14, t15;
    v2f64 src0r, src1r, src2r, src3r, src4r, src5r, src6r, src7r;
    v2f64 src0i, src1i, src2i, src3i, src4i, src5i, src6i, src7i;
    v2f64 tp0r, tp1r, tp2r, tp3r, tp0i, tp1i, tp2i, tp3i;

    /* Strides in FLOAT units: two FLOATs per element. */
    lda2 = 2 * lda;

    pa0 = A;
    pa1 = A + lda2;
    pa2 = A + 2 * lda2;
    pa3 = A + 3 * lda2;

    inc_x2 = 2 * inc_x;
    inc_y2 = 2 * inc_y;

    if (2 == inc_x2)
    {
        /* Unit-stride x: use the contiguous vector loaders. */
        #define ZLOAD_X4  ZLOAD_X4_VECTOR
        #define ZLOAD_X2  ZLOAD_X2_VECTOR

        ZGEMV_T_MSA();

        #undef ZLOAD_X4
        #undef ZLOAD_X2
    }
    else
    {
        /* Any other stride: gather x lane by lane. */
        #define ZLOAD_X4  ZLOAD_X4_GP
        #define ZLOAD_X2  ZLOAD_X2_GP

        ZGEMV_T_MSA();

        #undef ZLOAD_X4
        #undef ZLOAD_X2
    }

    return(0);
}
|
||||
|
||||
/* Drop the sign-selector macros so they do not leak past this kernel. */
#undef OP0
#undef OP1
#undef OP2
|
|
@ -0,0 +1 @@
|
|||
include $(KERNELDIR)/../mips/KERNEL.P5600
|
4
param.h
4
param.h
|
@ -2174,7 +2174,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
#define SYMV_P 16
|
||||
#endif
|
||||
|
||||
#if defined(I6400)
|
||||
#if defined(I6400) || defined(P6600)
|
||||
#define SNUMOPT 2
|
||||
#define DNUMOPT 2
|
||||
|
||||
|
@ -2190,7 +2190,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define CGEMM_DEFAULT_UNROLL_M 8
|
||||
#define CGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
|
||||
#define ZGEMM_DEFAULT_UNROLL_M 4
|
||||
#define ZGEMM_DEFAULT_UNROLL_N 4
|
||||
|
||||
|
|
Loading…
Reference in New Issue