From 4b548857d64e6f0fb3aefbd0bd5bd4d14f2a22d7 Mon Sep 17 00:00:00 2001 From: gxw Date: Thu, 26 Nov 2020 14:59:41 +0800 Subject: [PATCH] Add msa support for loongson 1. Using core loongson3r3 and loongson3r4 for loongson 2. Add DYNAMIC_ARCH for loongson Change-Id: I1c6b54dbeca3a0cc31d1222af36a7e9bd6ab54c1 --- Makefile.system | 27 +- common_linux.h | 8 - common_mips64.h | 9 +- cpuid_mips64.c | 91 +++---- driver/others/Makefile | 8 + driver/others/blas_server.c | 2 + driver/others/dynamic_mips64.c | 230 ++++++++++++++++++ driver/others/parameter.c | 16 +- getarch.c | 24 +- kernel/Makefile | 5 + kernel/Makefile.L3 | 4 - kernel/mips/cgemm_kernel_8x4_msa.c | 4 +- kernel/mips/crot_msa.c | 6 +- kernel/mips/cscal_msa.c | 6 +- kernel/mips/dscal_msa.c | 4 +- kernel/mips/dtrsm_kernel_LN_8x4_msa.c | 38 +-- kernel/mips/dtrsm_kernel_LT_8x4_msa.c | 36 +-- kernel/mips/dtrsm_kernel_RN_8x4_msa.c | 21 +- kernel/mips/dtrsm_kernel_RT_8x4_msa.c | 21 +- kernel/mips/macros_msa.h | 8 +- kernel/mips/srot_msa.c | 6 +- kernel/mips/sscal_msa.c | 6 +- kernel/mips/zscal_msa.c | 8 +- kernel/mips64/KERNEL.LOONGSON3B | 64 ----- .../{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} | 27 +- kernel/mips64/KERNEL.LOONGSON3R4 | 192 +++++++++++++++ kernel/setparam-ref.c | 72 ++++++ param.h | 100 ++++---- 28 files changed, 682 insertions(+), 361 deletions(-) create mode 100644 driver/others/dynamic_mips64.c delete mode 100644 kernel/mips64/KERNEL.LOONGSON3B rename kernel/mips64/{KERNEL.LOONGSON3A => KERNEL.LOONGSON3R3} (75%) create mode 100644 kernel/mips64/KERNEL.LOONGSON3R4 diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..6377f66ea 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 endif +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += -march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1223,10 +1222,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1342,11 +1339,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. -#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..5e0943c2e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for different number of threads. blas_set_parameter(); #endif +#endif } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + +/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. */ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/getarch.c b/getarch.c index 9344defb5..e59a4e9b7 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #endif -#ifdef FORCE_LOONGSON3A +#ifdef FORCE_LOONGSON3R3 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL = ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = ../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d0317a745..1e846a61c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,77 @@ static void init_parameter(void) { } #else // (ARCH_ARM64) +#if defined(ARCH_MIPS64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = 640; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif +} +#else // (ARCH_MIPS64) #if (ARCH_POWER) static void init_parameter(void) { @@ -1780,4 +1851,5 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH +#endif //(ARCH_MIPS64) #endif //(ARCH_ARM64) diff --git a/param.h b/param.h index a0d45c573..6946c2b41 100644 --- a/param.h +++ b/param.h @@ -2570,8 +2570,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#endif + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 80 + +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 + +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 + +#define SYMV_P 16 +#endif + +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -2612,47 +2667,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3B -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 - -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 - -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 - -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 - -#define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 - -#define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 - -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 - -#define GEMM_OFFSET_A1 0x10000 -#define GEMM_OFFSET_B1 0x100000 - -#define SYMV_P 16 -#endif - #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2