Add MSA support for Loongson

1. Use the LOONGSON3R3 and LOONGSON3R4 cores for Loongson
2. Add DYNAMIC_ARCH support for Loongson

Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1
This commit is contained in:
gxw 2020-11-26 14:59:41 +08:00
parent d67babf345
commit 4b548857d6
28 changed files with 682 additions and 361 deletions

View File

@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180
DYNAMIC_CORE += THUNDERX3T110
endif
ifeq ($(ARCH), mips64)
DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4
endif
ifeq ($(ARCH), zarch)
DYNAMIC_CORE = ZARCH_GENERIC
@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32
BINARY_DEFINED = 1
endif
ifeq ($(CORE), LOONGSON3A)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
endif
ifeq ($(CORE), LOONGSON3B)
CCOMMON_OPT += -march=mips64
FCOMMON_OPT += -march=mips64
ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4))
CCOMMON_OPT += -march=loongson3a
FCOMMON_OPT += -march=loongson3a
endif
ifeq ($(CORE), MIPS24K)
@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32
else
FCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3R3)
FCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
ifeq ($(CORE), LOONGSON3R4)
FCOMMON_OPT += -loongson3 -static
endif
@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32
else
CCOMMON_OPT += -n64
endif
ifeq ($(CORE), LOONGSON3A)
ifeq ($(CORE), LOONGSON3R3)
CCOMMON_OPT += -loongson3 -static
endif
ifeq ($(CORE), LOONGSON3B)
ifeq ($(CORE), LOONGSON3R4)
CCOMMON_OPT += -loongson3 -static
endif
@ -1223,10 +1222,8 @@ ifdef SMP
CCOMMON_OPT += -DSMP_SERVER
ifeq ($(ARCH), mips64)
ifneq ($(CORE), LOONGSON3B)
USE_SIMPLE_THREADED_LEVEL3 = 1
endif
endif
ifeq ($(USE_OPENMP), 1)
# USE_SIMPLE_THREADED_LEVEL3 = 1
@ -1342,11 +1339,9 @@ endif
ifneq ($(ARCH), x86_64)
ifneq ($(ARCH), x86)
ifneq ($(CORE), LOONGSON3B)
NO_AFFINITY = 1
endif
endif
endif
ifdef NO_AFFINITY
ifeq ($(NO_AFFINITY), 0)

View File

@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode,
// https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482
return 0;
#else
#if defined (LOONGSON3B)
#if defined (__64BIT__)
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#else
return 0; //NULL Implementation on Loongson 3B 32bit.
#endif
#else
//Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34
// unsigned long null_nodemask=0;
return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags);
#endif
#endif
}
static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) {

View File

@ -229,12 +229,7 @@ REALNAME: ;\
#define BUFFER_SIZE ( 32 << 21)
#if defined(LOONGSON3A)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif
#if defined(LOONGSON3B)
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#endif
@ -250,7 +245,7 @@ REALNAME: ;\
#define MAP_ANONYMOUS MAP_ANON
#endif
#if defined(LOONGSON3A) || defined(LOONGSON3B)
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#define PREFETCHD_(x) ld $0, x
#define PREFETCHD(x) PREFETCHD_(x)
#else

View File

@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3A 2
#define CPU_LOONGSON3B 3
#define CPU_I6400 4
#define CPU_P6600 5
#define CPU_I6500 6
#define CPU_UNKNOWN 0
#define CPU_SICORTEX 1
#define CPU_LOONGSON3R3 2
#define CPU_LOONGSON3R4 3
#define CPU_I6400 4
#define CPU_P6600 5
#define CPU_I6500 6
static char *cpuname[] = {
"UNKNOWN",
"SICORTEX",
"LOONGSON3A",
"LOONGSON3B",
"LOONGSON3R3",
"LOONGSON3R4",
"I6400",
"P6600",
"I6500"
@ -90,48 +90,13 @@ static char *cpuname[] = {
int detect(void){
#ifdef __linux
#ifdef linux
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("cpu", buffer, 3)){
p = strchr(buffer, ':') + 2;
#if 0
fprintf(stderr, "%s\n", p);
#endif
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}else if (strstr(p, "Loongson-3")){
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("system type", buffer, 11)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if (strstr(p, "loongson3a"))
return CPU_LOONGSON3A;
}else{
return CPU_SICORTEX;
}
}
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
p = (char *)NULL;
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
@ -140,14 +105,16 @@ int detect(void){
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A")){
return CPU_LOONGSON3A;
}else if(strstr(p, "Loongson-3B")){
return CPU_LOONGSON3B;
}
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){
return CPU_LOONGSON3R3;
}else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){
return CPU_LOONGSON3R4;
} else{
return CPU_SICORTEX;
}
#endif
return CPU_UNKNOWN;
}
}
char *get_corename(void){
@ -159,10 +126,10 @@ void get_architecture(void){
}
void get_subarchitecture(void){
if(detect()==CPU_LOONGSON3A) {
printf("LOONGSON3A");
}else if(detect()==CPU_LOONGSON3B){
printf("LOONGSON3B");
if(detect()==CPU_LOONGSON3R3) {
printf("LOONGSON3R3");
}else if(detect()==CPU_LOONGSON3R4){
printf("LOONGSON3R4");
}else if(detect()==CPU_I6400){
printf("I6400");
}else if(detect()==CPU_P6600){
@ -179,8 +146,8 @@ void get_subdirname(void){
}
void get_cpuconfig(void){
if(detect()==CPU_LOONGSON3A) {
printf("#define LOONGSON3A\n");
if(detect()==CPU_LOONGSON3R3) {
printf("#define LOONGSON3R3\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
@ -188,8 +155,8 @@ void get_cpuconfig(void){
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 4\n");
}else if(detect()==CPU_LOONGSON3B){
printf("#define LOONGSON3B\n");
}else if(detect()==CPU_LOONGSON3R4){
printf("#define LOONGSON3R4\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 32\n");
printf("#define L2_SIZE 512488\n");
@ -237,10 +204,10 @@ void get_cpuconfig(void){
}
void get_libname(void){
if(detect()==CPU_LOONGSON3A) {
printf("loongson3a\n");
}else if(detect()==CPU_LOONGSON3B) {
printf("loongson3b\n");
if(detect()==CPU_LOONGSON3R3) {
printf("loongson3r3\n");
}else if(detect()==CPU_LOONGSON3R4) {
printf("loongson3r4\n");
}else if(detect()==CPU_I6400) {
printf("i6400\n");
}else if(detect()==CPU_P6600) {

View File

@ -24,10 +24,14 @@ else
ifeq ($(ARCH),zarch)
COMMONOBJS += dynamic_zarch.$(SUFFIX)
else
ifeq ($(ARCH),mips64)
COMMONOBJS += dynamic_mips64.$(SUFFIX)
else
COMMONOBJS += dynamic.$(SUFFIX)
endif
endif
endif
endif
else
COMMONOBJS += parameter.$(SUFFIX)
endif
@ -92,10 +96,14 @@ else
ifeq ($(ARCH),zarch)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX)
else
ifeq ($(ARCH),mips64)
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX)
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
endif
endif
endif
endif
else
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
endif

View File

@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) {
blas_cpu_number = num_threads;
#if defined(ARCH_MIPS64)
#ifndef DYNAMIC_ARCH
//set parameters for different number of threads.
blas_set_parameter();
#endif
#endif
}

View File

@ -0,0 +1,230 @@
/*****************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <sys/wait.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include "common.h"
/* Per-core kernel tables; only declared here, defined in other translation units. */
extern gotoblas_t gotoblas_LOONGSON3R3;
extern gotoblas_t gotoblas_LOONGSON3R4;
/* Warning/log helper, declared here and defined elsewhere. */
extern void openblas_warning(int verbose, const char * msg);
/* Number of real core names in corename[]; it is also the index of the
 * trailing "UNKNOWN" sentinel entry. */
#define NUM_CORETYPES 2
/* Core names, indexed in the same order that force_coretype() and
 * gotoblas_corename() map to the kernel tables (0 = R3, 1 = R4). */
static char *corename[] = {
"loongson3r3",
"loongson3r4",
"UNKNOWN"
};
/* Return the printable name of the currently selected kernel table.
 * Yields the "UNKNOWN" sentinel entry when `gotoblas` matches neither
 * of the Loongson tables. */
char *gotoblas_corename(void) {
  int idx = NUM_CORETYPES;   /* sentinel slot: "UNKNOWN" */

  if (gotoblas == &gotoblas_LOONGSON3R3) {
    idx = 0;
  } else if (gotoblas == &gotoblas_LOONGSON3R4) {
    idx = 1;
  }

  return corename[idx];
}
/* Map a user-supplied core name onto its kernel table.
 *
 * coretype: NUL-terminated name, compared case-insensitively (first 20
 *           characters) against the entries of corename[].
 * Returns the matching table, or NULL after emitting a level-1 warning
 * when the name is not recognised.
 */
static gotoblas_t *force_coretype(char *coretype) {
  /* Same ordering as corename[]: slot 0 = R3, slot 1 = R4. */
  gotoblas_t *tables[NUM_CORETYPES] = {
    &gotoblas_LOONGSON3R3,
    &gotoblas_LOONGSON3R4
  };
  char message[128];
  int idx = 0;

  while (idx < NUM_CORETYPES) {
    if (strncasecmp(coretype, corename[idx], 20) == 0)
      return tables[idx];
    idx++;
  }

  snprintf(message, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, message);
  return NULL;
}
/* Feature bits tested in the word that get_coretype_from_cpucfg() reads. */
#define MMI_MASK    0x00000010
#define MSA_MASK    0x00000020

/* State shared between cpucfg_test() and its SIGILL handler.  Both are
 * private to this file, so give them internal linkage instead of exporting
 * generic symbol names such as `fd` from the library. */
static int fd[2];            /* pipe: [0] read end, [1] write end */
static int support_cpucfg;   /* 1 when the probe instruction executed */
/* SIGILL handler installed in the probe child by cpucfg_test().
 *
 * Closes the write end of the pipe without writing anything and terminates
 * the child with status 1.  _exit() is used rather than exit(): exit() is
 * not async-signal-safe and would also run atexit handlers and flush stdio
 * buffers that the forked child inherited from the parent.
 */
static void handler(int signum)
{
  (void)signum;
  close(fd[1]);
  _exit(1);
}
/* Brief : Function to check if cpucfg supported on loongson
 *
 * The probe instruction is executed in a forked child so that a SIGILL on
 * hardware without it cannot kill the calling process.  On success the
 * child sends a 1 through a pipe; if it traps, the SIGILL handler exits
 * without writing and the parent sees end-of-file.
 *
 * Return: 1 supported
 *         0 not supported (also returned when pipe/fork/waitpid/read fail)
 */
static int cpucfg_test(void) {
  pid_t pid;
  int status = 0;

  support_cpucfg = 0;
  if (pipe(fd) != 0)
    return support_cpucfg;

  pid = fork();
  if (pid == 0) { /* Subprocess */
    struct sigaction act;

    close(fd[0]);
    /* Set signal action for SIGILL.  Every field is initialised so that
     * sa_mask / sa_flags do not carry stack garbage into sigaction(). */
    memset(&act, 0, sizeof(act));
    sigemptyset(&act.sa_mask);
    act.sa_handler = handler;
    sigaction(SIGILL, &act, NULL);

    /* Execute cpucfg in subprocess. */
    __asm__ volatile(
      ".insn                      \n\t"
      ".word (0xc8080118)         \n\t"
      :::
    );
    support_cpucfg = 1;
    if (write(fd[1], &support_cpucfg, sizeof(support_cpucfg))
        != (ssize_t)sizeof(support_cpucfg))
      _exit(1);
    close(fd[1]);
    /* _exit: do not run the parent's atexit handlers or flush its
     * inherited stdio buffers a second time from the child. */
    _exit(0);
  } else if (pid > 0) { /* Parent process */
    close(fd[1]);
    if ((waitpid(pid, &status, 0) != pid) ||
        (read(fd[0], &support_cpucfg, sizeof(support_cpucfg))
         != (ssize_t)sizeof(support_cpucfg)))
      support_cpucfg = 0;
    close(fd[0]);
  } else { /* fork failed: release both pipe ends */
    close(fd[0]);
    close(fd[1]);
    support_cpucfg = 0;
  }

  return support_cpucfg;
}
/* Select the kernel table from the feature word obtained by the inline
 * assembly below.  Must only be called after cpucfg_test() reported
 * support, since the raw instruction word would otherwise trap.
 *
 * Returns the LOONGSON3R4 table when the MSA bit is set, otherwise the
 * LOONGSON3R3 table when the MMI bit is set, otherwise NULL.
 */
static gotoblas_t *get_coretype_from_cpucfg(void) {
  int flag = 0;

  /* $8 is loaded with 1, the raw instruction word is executed
   * (presumably cpucfg with $8 as selector and $9 as destination --
   * TODO confirm the encoding), then $9 is stored into `flag`.
   * Both hard-coded registers are listed as clobbers so the compiler
   * neither keeps live values in them nor places the %0 address there. */
  __asm__ volatile(
    ".insn                      \n\t"
    "dli    $8,    0x01         \n\t"
    ".word (0xc9084918)         \n\t"
    "usw    $9,    0x00(%0)     \n\t"
    :
    : "r"(&flag)
    : "memory", "$8", "$9"
  );

  /* MSA takes precedence over MMI when both bits are present. */
  if (flag & MSA_MASK)
    return (&gotoblas_LOONGSON3R4);
  if (flag & MMI_MASK)
    return (&gotoblas_LOONGSON3R3);
  return NULL;
}
static gotoblas_t *get_coretype_from_cpuinfo(void) {
#ifdef linux
FILE *infile;
char buffer[512], *p;
p = (char *)NULL;
//Check model name for Loongson3
infile = fopen("/proc/cpuinfo", "r");
while (fgets(buffer, sizeof(buffer), infile)){
if (!strncmp("model name", buffer, 10)){
p = strchr(buffer, ':') + 2;
break;
}
}
fclose(infile);
if(p != NULL){
if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000"))
return (&gotoblas_LOONGSON3R3);
else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000"))
return (&gotoblas_LOONGSON3R4);
else
return NULL;
}
#endif
return NULL;
}
/* Auto-detect the kernel table: use the cpucfg-based probe when
 * cpucfg_test() reports 1, otherwise fall back to parsing /proc/cpuinfo.
 * May return NULL when neither method identifies the core. */
static gotoblas_t *get_coretype(void) {
  if (cpucfg_test() == 1) {
    return get_coretype_from_cpucfg();
  }
  return get_coretype_from_cpuinfo();
}
/* Choose and initialise the runtime kernel table.
 *
 * Does nothing when `gotoblas` is already set.  Otherwise the table is
 * taken from the OPENBLAS_CORETYPE environment variable when present, or
 * auto-detected; if either yields NULL the LOONGSON3R3 table is used and a
 * level-1 warning is emitted.  The table's init() hook is then called.
 * The process exits with status 1 when the table has no init hook.
 */
void gotoblas_dynamic_init(void) {
  char coremsg[128];
  char *p;

  if (gotoblas) return;

  p = getenv("OPENBLAS_CORETYPE");
  if ( p )
  {
    gotoblas = force_coretype(p);
  }
  else
  {
    gotoblas = get_coretype();
  }

  if (gotoblas == NULL)
  {
    snprintf(coremsg, sizeof(coremsg), "Falling back to loongson3r3 core\n");
    openblas_warning(1, coremsg);
    gotoblas = &gotoblas_LOONGSON3R3;
  }

  if (gotoblas && gotoblas->init) {
    /* Single bounded format: no intermediate buffer that strncpy could
     * leave unterminated, and no unbounded sprintf.  "%.20s" keeps the
     * former 20-character cap on the core name. */
    snprintf(coremsg, sizeof(coremsg), "Core: %.20s\n", gotoblas_corename());
    openblas_warning(2, coremsg);
    gotoblas -> init();
  } else {
    openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
    exit(1);
  }
}
/* Drop the selected kernel table; with `gotoblas` reset to NULL,
 * gotoblas_dynamic_init() no longer returns early and will select again. */
void gotoblas_dynamic_quit(void) {
gotoblas = NULL;
}

View File

@ -717,7 +717,7 @@ void blas_set_parameter(void){
#if defined(ARCH_MIPS64)
void blas_set_parameter(void){
#if defined(LOONGSON3A)
#if defined(LOONGSON3R3) || defined(LOONGSON3R4)
#ifdef SMP
if(blas_num_threads == 1){
#endif
@ -731,20 +731,6 @@ void blas_set_parameter(void){
#endif
#endif
#if defined(LOONGSON3B)
#ifdef SMP
if(blas_num_threads == 1 || blas_num_threads == 2){
#endif
//single thread
dgemm_r = 640;
#ifdef SMP
}else{
//multi thread
dgemm_r = 160;
}
#endif
#endif
}
#endif

View File

@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_PPC440FP2 */
/* #define FORCE_CELL */
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3A */
/* #define FORCE_LOONGSON3B */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#endif
#ifdef FORCE_LOONGSON3A
#ifdef FORCE_LOONGSON3R3
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3A"
#define SUBARCHITECTURE "LOONGSON3R3"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DLOONGSON3A " \
#define ARCHCONFIG "-DLOONGSON3R3 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3a"
#define CORENAME "LOONGSON3A"
#define LIBNAME "loongson3r3"
#define CORENAME "LOONGSON3R3"
#else
#endif
#ifdef FORCE_LOONGSON3B
#ifdef FORCE_LOONGSON3R4
#define FORCE
#define ARCHITECTURE "MIPS"
#define SUBARCHITECTURE "LOONGSON3B"
#define SUBARCHITECTURE "LOONGSON3R4"
#define SUBDIRNAME "mips64"
#define ARCHCONFIG "-DLOONGSON3B " \
#define ARCHCONFIG "-DLOONGSON3R4 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
"-DL2_SIZE=512488 -DL2_LINESIZE=32 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
#define LIBNAME "loongson3b"
#define CORENAME "LOONGSON3B"
#define LIBNAME "loongson3r4"
#define CORENAME "LOONGSON3R4"
#else
#endif

View File

@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX)
endif
else ifeq ($(TARGET_CORE), HASWELL)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT)
else ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS)
else
override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE)
endif
@ -68,6 +70,9 @@ else
TARGET_CORE = $(CORE)
KDIR =
TSUFFIX =
ifeq ($(TARGET_CORE), LOONGSON3R4)
override CFLAGS += $(MSA_FLAGS)
endif
endif
-include $(KERNELDIR)/KERNEL.$(TARGET_CORE)

View File

@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64)
USE_TRMM = 1
endif
ifeq ($(TARGET), LOONGSON3B)
USE_TRMM = 1
endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), GENERIC)
USE_TRMM = 1

View File

@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \
{ \
LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
\
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \
@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \
{ \
LD_SP2_INC(pa0, 4, src_a0, src_a1); \
src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \
src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \
SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \
\
PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \

View File

@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{
if ((0 == c) && (0 == s))
{
v4f32 zero = __msa_cast_to_vector_float(0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
v4f32 zero = {0.0, 0.0, 0.0, 0.0};
/* process 2 elements */
for (j = (n >> 1); j--;)

View File

@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
if ((0.0 == da_r) && (0.0 == da_i))
{
v4f32 zero_v = __msa_cast_to_vector_float(0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
for (i = (n >> 5); i--;)
{

View File

@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{
if (0.0 == da)
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
v2f64 zero_v = {0.0, 0.0};
for (i = (n >> 5); i--;)
{

View File

@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13);
ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15);
src_a54 = __msa_cast_to_vector_double(*(a + 54));
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54));
src_a62 = LD_DP(a + 62);
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1);
src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0);
@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
src_a44 = LD_DP(a + 44);
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1);
src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0);
src_a36 = __msa_cast_to_vector_double(*(a + 36));
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36));
res_c7 *= src_a63;
res_c6 -= res_c7 * src_a62;
@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
src_a26 = LD_DP(a + 26);
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1);
src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0);
src_a18 = __msa_cast_to_vector_double(*(a + 18));
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18));
res_c3 -= res_c7 * src_a59;
res_c2 -= res_c7 * src_a58;
@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
src_a8 = LD_DP(a + 8);
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
src_a0 = __msa_cast_to_vector_double(*(a + 0));
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
res_c1 -= res_c2 * src_a17;
res_c1 *= src_a9;
@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a52 = LD_DP(a - 12);
src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1);
src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0);
src_a54 = __msa_cast_to_vector_double(*(a - 10));
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10));
src_a40 = LD_DP(a - 24);
src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1);
@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a34 = LD_DP(a - 30);
src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1);
src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0);
src_a36 = __msa_cast_to_vector_double(*(a - 28));
src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0);
src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28));
res_c4 *= src_a36;
res_c3 -= res_c4 * src_a35;
@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a16 = LD_DP(a - 48);
src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1);
src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0);
src_a18 = __msa_cast_to_vector_double(*(a - 46));
src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0);
src_a0 = __msa_cast_to_vector_double(*(a - 64));
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46));
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64));
src_a8 = LD_DP(a - 56);
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1);
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
src_a8 = __msa_cast_to_vector_double(*(a + 8));
src_a0 = __msa_cast_to_vector_double(*(a + 0));
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8));
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
src_a4 = LD_DP(a + 4);
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);
@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1);
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
src_a8 = __msa_cast_to_vector_double(*(a + 8));
src_a0 = __msa_cast_to_vector_double(*(a + 0));
src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0);
src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0);
src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8));
src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0));
src_a4 = LD_DP(a + 4);
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1);

View File

@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
res_c14 -= res_c8 * src_a6;
res_c15 -= res_c8 * src_a7;
src_a9 = __msa_cast_to_vector_double(*(a + 9));
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9));
src_a10 = LD_DP(a + 10);
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
res_c14 -= res_c10 * src_a22;
res_c15 -= res_c10 * src_a23;
src_a27 = __msa_cast_to_vector_double(*(a + 27));
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27));
src_a28 = LD_DP(a + 28);
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
res_c14 -= res_c12 * src_a38;
res_c15 -= res_c12 * src_a39;
src_a45 = __msa_cast_to_vector_double(*(a + 45));
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45));
src_a46 = LD_DP(a + 46);
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6);
ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14);
src_a63 = __msa_cast_to_vector_double(*(a + 63));
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63));
src_a54 = LD_DP(a + 54);
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c6 -= res_c0 * src_a6;
res_c7 -= res_c0 * src_a7;
src_a9 = __msa_cast_to_vector_double(*(a + 9));
src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0);
src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9));
src_a10 = LD_DP(a + 10);
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c6 -= res_c2 * src_a22;
res_c7 -= res_c2 * src_a23;
src_a27 = __msa_cast_to_vector_double(*(a + 27));
src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0);
src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27));
src_a28 = LD_DP(a + 28);
src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1);
src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0);
@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c6 -= res_c4 * src_a38;
res_c7 -= res_c4 * src_a39;
src_a45 = __msa_cast_to_vector_double(*(a + 45));
src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0);
src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45));
src_a46 = LD_DP(a + 46);
src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1);
src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0);
@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c6 -= res_c5 * src_a46;
res_c7 -= res_c5 * src_a47;
src_a63 = __msa_cast_to_vector_double(*(a + 63));
src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0);
src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63));
src_a54 = LD_DP(a + 54);
src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1);
src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0);
@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c6 -= res_c4 * src_a2;
res_c7 -= res_c4 * src_a3;
src_a5 = __msa_cast_to_vector_double(*(a + 5));
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5));
src_a6 = LD_DP(a + 6);
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a10 = LD_DP(a + 10);
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
src_a15 = __msa_cast_to_vector_double(*(a + 15));
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15));
res_c2 *= src_a10;
res_c3 -= res_c2 * src_a11;
@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
res_c2 -= res_c0 * src_a2;
res_c3 -= res_c0 * src_a3;
src_a5 = __msa_cast_to_vector_double(*(a + 5));
src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0);
src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5));
src_a6 = LD_DP(a + 6);
src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1);
src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0);
@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_a10 = LD_DP(a + 10);
src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1);
src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0);
src_a15 = __msa_cast_to_vector_double(*(a + 15));
src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0);
src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15));
res_c2 *= src_a10;
res_c3 -= res_c2 * src_a11;

View File

@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
src_b2 = LD_DP(b + 2);
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
src_b5 = __msa_cast_to_vector_double(*(b + 5));
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5));
src_b6 = LD_DP(b + 6);
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
src_b10 = LD_DP(b + 10);
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
src_b15 = __msa_cast_to_vector_double(*(b + 15));
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15));
src_c0 *= src_b0;
src_c1 *= src_b0;
@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_b0 = LD_DP(b + 0);
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b3 = __msa_cast_to_vector_double(*(b + 3));
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3));
src_c0 *= src_b0;
src_c1 *= src_b0;
@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
}
}
src_b0 = __msa_cast_to_vector_double(*b);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*b);
src_c0 *= src_b0;
src_c1 *= src_b0;
@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_b2 = LD_DP(b + 2);
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
src_b5 = __msa_cast_to_vector_double(*(b + 5));
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0);
src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5));
src_b6 = LD_DP(b + 6);
src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1);
src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0);
src_b10 = LD_DP(b + 10);
src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1);
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
src_b15 = __msa_cast_to_vector_double(*(b + 15));
src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0);
src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15));
src_c0 *= src_b0;
src_c1 *= src_b0;
@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_b0 = LD_DP(b + 0);
src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b3 = __msa_cast_to_vector_double(*(b + 3));
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0);
src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3));
src_c0 *= src_b0;
src_c1 *= src_b0;

View File

@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk)
src_b8 = LD_DP(b + 8);
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
src_b10 = __msa_cast_to_vector_double(*(b + 10));
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
src_b4 = LD_DP(b + 4);
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
a -= 16;
b -= 4;
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
src_b2 = LD_DP(b + 2);
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);
@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk)
a -= 8;
b -= 1;
src_b0 = __msa_cast_to_vector_double(*b);
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*b);
src_c0 *= src_b0;
src_c1 *= src_b0;
@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
src_b8 = LD_DP(b + 8);
src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1);
src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0);
src_b10 = __msa_cast_to_vector_double(*(b + 10));
src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0);
src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10));
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
src_b4 = LD_DP(b + 4);
src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1);
src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0);
@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO
a -= 8;
b -= 4;
src_b0 = __msa_cast_to_vector_double(*(b + 0));
src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0);
src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0));
src_b2 = LD_DP(b + 2);
src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1);
src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0);

View File

@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src)
#define ST_DP(...) ST_D(v2f64, __VA_ARGS__)
#define COPY_FLOAT_TO_VECTOR(a) ( { \
v4f32 out; \
out = __msa_cast_to_vector_float(a); \
out = (v4f32) __msa_splati_w((v4i32) out, 0); \
v4f32 out = {a, a, a, a}; \
out; \
} )
#define COPY_DOUBLE_TO_VECTOR(a) ( { \
v2f64 out; \
out = __msa_cast_to_vector_double(a); \
out = (v2f64) __msa_splati_d((v2i64) out, 0); \
v2f64 out = {a, a}; \
out; \
} )

View File

@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y,
{
if ((0 == c) && (0 == s))
{
v4f32 zero = __msa_cast_to_vector_float(0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0);
zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0);
v4f32 zero = {0.0, 0.0, 0.0, 0.0};
/* process 4 floats */
for (j = (n >> 2); j--;)

View File

@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x,
{
if (0.0 == da)
{
v4f32 zero_v = __msa_cast_to_vector_float(0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0);
zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0);
v4f32 zero_v = {0.0, 0.0, 0.0, 0.0};
for (i = (n >> 6); i--;)
{

View File

@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
if ((0.0 == da_r) && (0.0 == da_i))
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
v2f64 zero_v = {0.0, 0.0};
for (i = (n >> 4); i--;)
{
@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if ((0.0 == da_r) && (0.0 == da_i))
{
v2f64 zero_v = __msa_cast_to_vector_double(0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0);
zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0);
v2f64 zero_v = {0.0, 0.0};
for (i = (n >> 4); i--;)
{

View File

@ -1,64 +0,0 @@
SAXPYKERNEL=axpy_loongson3a.S
DAXPYKERNEL=daxpy_loongson3a_simd.S
SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DSDOTKERNEL = ../mips/dot.c

View File

@ -0,0 +1,192 @@
# Level-1 and level-2 BLAS kernel selection.
#
# DSDOT uses the same source regardless of MSA availability.
DSDOTKERNEL = ../mips/dot.c

ifndef HAVE_MSA

# Without MSA only AXPY (single/double) and GEMV are assigned here;
# every other level-1 kernel variable is left unset by this file.
SAXPYKERNEL = axpy_loongson3a.S
DAXPYKERNEL = daxpy_loongson3a_simd.S

SGEMVNKERNEL = gemv_n_loongson3a.c
SGEMVTKERNEL = gemv_t_loongson3a.c
DGEMVNKERNEL = gemv_n_loongson3a.c
DGEMVTKERNEL = gemv_t_loongson3a.c
CGEMVNKERNEL = zgemv_n_loongson3a.c
CGEMVTKERNEL = zgemv_t_loongson3a.c
ZGEMVNKERNEL = zgemv_n_loongson3a.c
ZGEMVTKERNEL = zgemv_t_loongson3a.c

else

# With MSA every routine below is taken from the shared ../mips MSA sources.

# AXPY
SAXPYKERNEL = ../mips/saxpy_msa.c
DAXPYKERNEL = ../mips/daxpy_msa.c
CAXPYKERNEL = ../mips/caxpy_msa.c
ZAXPYKERNEL = ../mips/zaxpy_msa.c

# COPY
SCOPYKERNEL = ../mips/scopy_msa.c
DCOPYKERNEL = ../mips/dcopy_msa.c
CCOPYKERNEL = ../mips/ccopy_msa.c
ZCOPYKERNEL = ../mips/zcopy_msa.c

# DOT
SDOTKERNEL = ../mips/sdot_msa.c
DDOTKERNEL = ../mips/ddot_msa.c
CDOTKERNEL = ../mips/cdot_msa.c
ZDOTKERNEL = ../mips/zdot_msa.c

# ROT
SROTKERNEL = ../mips/srot_msa.c
DROTKERNEL = ../mips/drot_msa.c
CROTKERNEL = ../mips/crot_msa.c
ZROTKERNEL = ../mips/zrot_msa.c

# SCAL
SSCALKERNEL = ../mips/sscal_msa.c
DSCALKERNEL = ../mips/dscal_msa.c
CSCALKERNEL = ../mips/cscal_msa.c
ZSCALKERNEL = ../mips/zscal_msa.c

# GEMV
SGEMVNKERNEL = ../mips/sgemv_n_msa.c
DGEMVNKERNEL = ../mips/dgemv_n_msa.c
SGEMVTKERNEL = ../mips/sgemv_t_msa.c
DGEMVTKERNEL = ../mips/dgemv_t_msa.c
CGEMVNKERNEL = ../mips/cgemv_n_msa.c
CGEMVTKERNEL = ../mips/cgemv_t_msa.c
ZGEMVNKERNEL = ../mips/zgemv_n_msa.c
ZGEMVTKERNEL = ../mips/zgemv_t_msa.c

# ASUM
SASUMKERNEL = ../mips/sasum_msa.c
DASUMKERNEL = ../mips/dasum_msa.c
CASUMKERNEL = ../mips/casum_msa.c
ZASUMKERNEL = ../mips/zasum_msa.c

# SWAP
SSWAPKERNEL = ../mips/sswap_msa.c
DSWAPKERNEL = ../mips/dswap_msa.c
CSWAPKERNEL = ../mips/cswap_msa.c
ZSWAPKERNEL = ../mips/zswap_msa.c

endif
# SGEMM micro-kernel and packing (copy) routines.
ifdef HAVE_MSA
SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c
SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c
SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c
else
SGEMMKERNEL = sgemm_kernel_8x4_ps.S
# The inner (IN/IT) copy routines and their objects are only assigned
# for the non-MSA kernel.
SGEMMINCOPY = ../generic/gemm_ncopy_8.c
SGEMMITCOPY = ../generic/gemm_tcopy_8.c
SGEMMONCOPY = ../generic/gemm_ncopy_4.c
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX)
SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX)
endif
# Outer copy object names are the same in both configurations.
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
# DGEMM micro-kernel and packing (copy) routines.
ifdef HAVE_MSA
DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c
# The inner (IN/IT) copy routines and their objects are only assigned
# for the MSA kernel.
DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c
DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c
DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c
DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c
DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
else
DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S
DGEMMONCOPY = ../generic/gemm_ncopy_4.c
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
endif
# Outer copy object names are the same in both configurations.
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
# CGEMM micro-kernel and packing (copy) routines.
ifdef HAVE_MSA
CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c
CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c
CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c
CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c
CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c
else
CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S
CGEMMINCOPY = ../generic/zgemm_ncopy_4.c
CGEMMITCOPY = ../generic/zgemm_tcopy_4.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# All four copy object names are the same in both configurations.
CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
# ZGEMM micro-kernel and packing (copy) routines.
ifdef HAVE_MSA
ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c
ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c
ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c
else
ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
endif
# Copy object names are the same in both configurations.
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
# STRSM kernels: generic C without MSA, 8x8 MSA sources otherwise.
ifndef HAVE_MSA
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c
STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c
STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c
STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c
endif
# DTRSM kernels: generic C without MSA, 8x4 MSA sources otherwise.
ifndef HAVE_MSA
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
else
DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c
DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c
DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c
DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c
endif
# CTRSM kernels: the generic C implementation is used whether or not
# HAVE_MSA is defined, so no conditional is needed. Reintroduce an
# ifdef HAVE_MSA branch here if MSA-specific CTRSM kernels are added.
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
# ZTRSM kernels: the generic C implementation is used whether or not
# HAVE_MSA is defined, so no conditional is needed. Reintroduce an
# ifdef HAVE_MSA branch here if MSA-specific ZTRSM kernels are added.
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c

View File

@ -933,6 +933,77 @@ static void init_parameter(void) {
}
#else // (ARCH_ARM64)
#if defined(ARCH_MIPS64)
/*
 * Populate the GEMM blocking parameters (P, Q, R) of TABLE_NAME for the
 * MIPS64 build. Values come from the *_DEFAULT_* macros, except dgemm_r
 * which is assigned a literal.
 */
static void init_parameter(void)
{
  /* Single precision real. */
  TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
  TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
  TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;

  /* Double precision real. */
  TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
  TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
  /* NOTE(review): literal instead of DGEMM_DEFAULT_R; presumably because
     param.h defines DGEMM_DEFAULT_R as the identifier dgemm_r for this
     target -- confirm. */
  TABLE_NAME.dgemm_r = 640;

  /* Single precision complex. */
  TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
  TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
  TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;

  /* Double precision complex. */
  TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
  TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
  TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;

#ifdef EXPRECISION
  /* Extended precision real (q) and complex (x). */
  TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P;
  TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q;
  TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R;

  TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P;
  TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q;
  TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R;
#endif

#ifdef USE_GEMM3M
  /*
   * GEMM3M parameters: use the dedicated default when one is defined,
   * otherwise reuse the value already stored for the matching real GEMM.
   * These must stay after the assignments above because the fallbacks
   * read the table.
   */

  /* cgemm3m falls back to the sgemm values. */
#ifdef CGEMM3M_DEFAULT_P
  TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P;
#else
  TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p;
#endif
#ifdef CGEMM3M_DEFAULT_Q
  TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q;
#else
  TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q;
#endif
#ifdef CGEMM3M_DEFAULT_R
  TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R;
#else
  TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r;
#endif

  /* zgemm3m falls back to the dgemm values. */
#ifdef ZGEMM3M_DEFAULT_P
  TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P;
#else
  TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p;
#endif
#ifdef ZGEMM3M_DEFAULT_Q
  TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q;
#else
  TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q;
#endif
#ifdef ZGEMM3M_DEFAULT_R
  TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R;
#else
  TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r;
#endif

#ifdef EXPRECISION
  /* xgemm3m always mirrors the qgemm values. */
  TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p;
  TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q;
  TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r;
#endif
#endif
}
#else // (ARCH_MIPS64)
#if (ARCH_POWER)
static void init_parameter(void) {
@ -1780,4 +1851,5 @@ static void init_parameter(void) {
}
#endif //POWER
#endif //ZARCH
#endif //(ARCH_MIPS64)
#endif //(ARCH_ARM64)

100
param.h
View File

@ -2570,8 +2570,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
#ifdef LOONGSON3A
/*Copy from SICORTEX*/
/* GEMM tuning parameters for the LOONGSON3R4 core. */
#if defined(LOONGSON3R4)
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
/* Micro-kernel unroll factors (M x N) depend on whether MSA is available.
   NOTE(review): these presumably have to match the tile sizes of the kernel
   sources chosen in the LOONGSON3R4 kernel makefile (e.g. 8x8 for the MSA
   SGEMM kernel) -- confirm when changing either side. */
#ifdef HAVE_MSA
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_M 8
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 8
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_M 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#else
#define SGEMM_DEFAULT_UNROLL_M 8
#define SGEMM_DEFAULT_UNROLL_N 4
#define DGEMM_DEFAULT_UNROLL_M 4
#define DGEMM_DEFAULT_UNROLL_N 4
#define CGEMM_DEFAULT_UNROLL_M 4
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#endif
/* Blocking factors P, Q and R; identical with and without MSA. */
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 44
#define CGEMM_DEFAULT_P 64
#define ZGEMM_DEFAULT_P 32
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 92
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 80
#define SGEMM_DEFAULT_R 640
/* Unlike its siblings, DGEMM_DEFAULT_R expands to the identifier dgemm_r
   rather than to a literal constant. */
#define DGEMM_DEFAULT_R dgemm_r
#define CGEMM_DEFAULT_R 640
#define ZGEMM_DEFAULT_R 640
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
#if defined(LOONGSON3R3)
////Copy from SICORTEX
#define SNUMOPT 2
#define DNUMOPT 2
@ -2612,47 +2667,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
#ifdef LOONGSON3B
#define SNUMOPT 2
#define DNUMOPT 2
#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
#define GEMM_DEFAULT_ALIGN 0x03fffUL
#define SGEMM_DEFAULT_UNROLL_M 2
#define SGEMM_DEFAULT_UNROLL_N 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_N 2
#define ZGEMM_DEFAULT_UNROLL_M 2
#define ZGEMM_DEFAULT_UNROLL_N 2
#define SGEMM_DEFAULT_P 64
#define DGEMM_DEFAULT_P 24
#define CGEMM_DEFAULT_P 24
#define ZGEMM_DEFAULT_P 20
#define SGEMM_DEFAULT_Q 192
#define DGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 64
#define SGEMM_DEFAULT_R 512
#define DGEMM_DEFAULT_R 512
#define CGEMM_DEFAULT_R 512
#define ZGEMM_DEFAULT_R 512
#define GEMM_OFFSET_A1 0x10000
#define GEMM_OFFSET_B1 0x100000
#define SYMV_P 16
#endif
#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2
#define DNUMOPT 2