diff --git a/Makefile.system b/Makefile.system index c17cd3bd1..5adde36d8 100644 --- a/Makefile.system +++ b/Makefile.system @@ -625,6 +625,10 @@ DYNAMIC_CORE += EMAG8180 DYNAMIC_CORE += THUNDERX3T110 endif +ifeq ($(ARCH), mips64) +DYNAMIC_CORE = LOONGSON3R3 LOONGSON3R4 +endif + ifeq ($(ARCH), zarch) DYNAMIC_CORE = ZARCH_GENERIC @@ -672,7 +676,7 @@ DYNAMIC_CORE += POWER9 else $(info, OpenBLAS: Your gcc version is too old to build the POWER9 kernels.) endif -LDVERSIONGTEQ35 := $(shell expr `ld --version | head -1 | cut -f2 -d "." | cut -f1 -d "-"` >= 35) +LDVERSIONGTEQ35 := $(shell expr `$(CC) -Wl,--version 2> /dev/null | head -1 | cut -f2 -d "." | cut -f1 -d "-"` \>= 35) ifeq ($(GCCVERSIONGTEQ11)$(LDVERSIONGTEQ35), 11) DYNAMIC_CORE += POWER10 CCOMMON_OPT += -DHAVE_P10_SUPPORT @@ -787,14 +791,9 @@ CCOMMON_OPT += -mabi=32 BINARY_DEFINED = 1 endif -ifeq ($(CORE), LOONGSON3A) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 -endif - -ifeq ($(CORE), LOONGSON3B) -CCOMMON_OPT += -march=mips64 -FCOMMON_OPT += -march=mips64 +ifeq ($(CORE), $(filter $(CORE),LOONGSON3R3 LOONGSON3R4)) +CCOMMON_OPT += -march=loongson3a +FCOMMON_OPT += -march=loongson3a endif ifeq ($(CORE), MIPS24K) @@ -1078,11 +1077,11 @@ FCOMMON_OPT += -n32 else FCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) FCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) FCOMMON_OPT += -loongson3 -static endif @@ -1108,11 +1107,11 @@ CCOMMON_OPT += -n32 else CCOMMON_OPT += -n64 endif -ifeq ($(CORE), LOONGSON3A) +ifeq ($(CORE), LOONGSON3R3) CCOMMON_OPT += -loongson3 -static endif -ifeq ($(CORE), LOONGSON3B) +ifeq ($(CORE), LOONGSON3R4) CCOMMON_OPT += -loongson3 -static endif @@ -1223,10 +1222,8 @@ ifdef SMP CCOMMON_OPT += -DSMP_SERVER ifeq ($(ARCH), mips64) -ifneq ($(CORE), LOONGSON3B) USE_SIMPLE_THREADED_LEVEL3 = 1 endif -endif ifeq ($(USE_OPENMP), 1) # USE_SIMPLE_THREADED_LEVEL3 = 1 @@ -1342,11 +1339,9 @@ endif ifneq ($(ARCH), x86_64) ifneq ($(ARCH), x86) -ifneq ($(CORE), LOONGSON3B) NO_AFFINITY = 1 endif endif -endif ifdef NO_AFFINITY ifeq ($(NO_AFFINITY), 0) diff --git a/c_check b/c_check index fe9c53f0e..970d475d7 100644 --- a/c_check +++ b/c_check @@ -199,7 +199,7 @@ if (($architecture eq "mips") || ($architecture eq "mips64")) { } else { $tmpf = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $code = '"addvi.b $w0, $w1, 1"'; - $msa_flags = "-mmsa -mfp64 -msched-weight -mload-store-pairs"; + $msa_flags = "-mmsa -mfp64 -mload-store-pairs"; print $tmpf "#include \n\n"; print $tmpf "void main(void){ __asm__ volatile($code); }\n"; diff --git a/common_linux.h b/common_linux.h index 35f3fb658..5a1c4e150 100644 --- a/common_linux.h +++ b/common_linux.h @@ -75,18 +75,10 @@ static inline int my_mbind(void *addr, unsigned long len, int mode, // https://lsbbugs.linuxfoundation.org/show_bug.cgi?id=3482 return 0; #else -#if defined (LOONGSON3B) -#if defined (__64BIT__) - return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); -#else - return 0; //NULL Implementation on Loongson 3B 32bit. 
-#endif -#else //Fixed randomly SEGFAULT when nodemask==NULL with above Linux 2.6.34 // unsigned long null_nodemask=0; return syscall(SYS_mbind, addr, len, mode, nodemask, maxnode, flags); #endif -#endif } static inline int my_set_mempolicy(int mode, const unsigned long *addr, unsigned long flag) { diff --git a/common_mips64.h b/common_mips64.h index a06edfe08..287459e7d 100644 --- a/common_mips64.h +++ b/common_mips64.h @@ -229,12 +229,7 @@ REALNAME: ;\ #define BUFFER_SIZE ( 32 << 21) -#if defined(LOONGSON3A) -#define PAGESIZE (16UL << 10) -#define FIXED_PAGESIZE (16UL << 10) -#endif - -#if defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PAGESIZE (16UL << 10) #define FIXED_PAGESIZE (16UL << 10) #endif @@ -250,7 +245,7 @@ REALNAME: ;\ #define MAP_ANONYMOUS MAP_ANON #endif -#if defined(LOONGSON3A) || defined(LOONGSON3B) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #define PREFETCHD_(x) ld $0, x #define PREFETCHD(x) PREFETCHD_(x) #else diff --git a/cpuid_mips64.c b/cpuid_mips64.c index 0c19ac1e7..674b65908 100644 --- a/cpuid_mips64.c +++ b/cpuid_mips64.c @@ -70,19 +70,19 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* or implied, of The University of Texas at Austin. */ /*********************************************************************/ -#define CPU_UNKNOWN 0 -#define CPU_SICORTEX 1 -#define CPU_LOONGSON3A 2 -#define CPU_LOONGSON3B 3 -#define CPU_I6400 4 -#define CPU_P6600 5 -#define CPU_I6500 6 +#define CPU_UNKNOWN 0 +#define CPU_SICORTEX 1 +#define CPU_LOONGSON3R3 2 +#define CPU_LOONGSON3R4 3 +#define CPU_I6400 4 +#define CPU_P6600 5 +#define CPU_I6500 6 static char *cpuname[] = { "UNKNOWN", "SICORTEX", - "LOONGSON3A", - "LOONGSON3B", + "LOONGSON3R3", + "LOONGSON3R4", "I6400", "P6600", "I6500" @@ -90,48 +90,13 @@ static char *cpuname[] = { int detect(void){ -#ifdef __linux +#ifdef linux FILE *infile; char buffer[512], *p; p = (char *)NULL; - infile = fopen("/proc/cpuinfo", "r"); - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("cpu", buffer, 3)){ - p = strchr(buffer, ':') + 2; -#if 0 - fprintf(stderr, "%s\n", p); -#endif - break; - } - } - - fclose(infile); - - if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - }else if (strstr(p, "Loongson-3")){ - infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; - while (fgets(buffer, sizeof(buffer), infile)){ - if (!strncmp("system type", buffer, 11)){ - p = strchr(buffer, ':') + 2; - break; - } - } - fclose(infile); - if (strstr(p, "loongson3a")) - return CPU_LOONGSON3A; - }else{ - return CPU_SICORTEX; - } - } //Check model name for Loongson3 infile = fopen("/proc/cpuinfo", "r"); - p = (char *)NULL; while (fgets(buffer, sizeof(buffer), infile)){ if (!strncmp("model name", buffer, 10)){ p = strchr(buffer, ':') + 2; @@ -140,14 +105,16 @@ int detect(void){ } fclose(infile); if(p != NULL){ - if (strstr(p, "Loongson-3A")){ - return CPU_LOONGSON3A; - }else if(strstr(p, "Loongson-3B")){ - return CPU_LOONGSON3B; - } + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")){ + return CPU_LOONGSON3R3; + }else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")){ + return CPU_LOONGSON3R4; + } else{ + return CPU_SICORTEX; } #endif return CPU_UNKNOWN; + } } char *get_corename(void){ @@ -159,10 +126,10 @@ void get_architecture(void){ } void get_subarchitecture(void){ - if(detect()==CPU_LOONGSON3A) { - printf("LOONGSON3A"); - }else if(detect()==CPU_LOONGSON3B){ - 
printf("LOONGSON3B"); + if(detect()==CPU_LOONGSON3R3) { + printf("LOONGSON3R3"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("LOONGSON3R4"); }else if(detect()==CPU_I6400){ printf("I6400"); }else if(detect()==CPU_P6600){ @@ -179,8 +146,8 @@ void get_subdirname(void){ } void get_cpuconfig(void){ - if(detect()==CPU_LOONGSON3A) { - printf("#define LOONGSON3A\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("#define LOONGSON3R3\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -188,8 +155,8 @@ void get_cpuconfig(void){ printf("#define DTB_DEFAULT_ENTRIES 64\n"); printf("#define DTB_SIZE 4096\n"); printf("#define L2_ASSOCIATIVE 4\n"); - }else if(detect()==CPU_LOONGSON3B){ - printf("#define LOONGSON3B\n"); + }else if(detect()==CPU_LOONGSON3R4){ + printf("#define LOONGSON3R4\n"); printf("#define L1_DATA_SIZE 65536\n"); printf("#define L1_DATA_LINESIZE 32\n"); printf("#define L2_SIZE 512488\n"); @@ -237,10 +204,10 @@ void get_cpuconfig(void){ } void get_libname(void){ - if(detect()==CPU_LOONGSON3A) { - printf("loongson3a\n"); - }else if(detect()==CPU_LOONGSON3B) { - printf("loongson3b\n"); + if(detect()==CPU_LOONGSON3R3) { + printf("loongson3r3\n"); + }else if(detect()==CPU_LOONGSON3R4) { + printf("loongson3r4\n"); }else if(detect()==CPU_I6400) { printf("i6400\n"); }else if(detect()==CPU_P6600) { diff --git a/driver/level3/level3.c b/driver/level3/level3.c index a38506585..9b44deb85 100644 --- a/driver/level3/level3.c +++ b/driver/level3/level3.c @@ -339,8 +339,10 @@ int CNAME(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else - if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; +/* + if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif diff --git a/driver/level3/level3_thread.c b/driver/level3/level3_thread.c index 6e1fd9e99..2b33c9589 100644 --- a/driver/level3/level3_thread.c +++ b/driver/level3/level3_thread.c @@ -373,8 +373,10 @@ static int inner_thread(blas_arg_t *args, BLASLONG *range_m, BLASLONG *range_n, #else if (min_jj >= 3*GEMM_UNROLL_N) min_jj = 3*GEMM_UNROLL_N; else +/* if (min_jj >= 2*GEMM_UNROLL_N) min_jj = 2*GEMM_UNROLL_N; else +*/ if (min_jj > GEMM_UNROLL_N) min_jj = GEMM_UNROLL_N; #endif /* Copy part of local region of B into workspace */ diff --git a/driver/others/Makefile b/driver/others/Makefile index d09444f56..4a421ef31 100644 --- a/driver/others/Makefile +++ b/driver/others/Makefile @@ -24,10 +24,14 @@ else ifeq ($(ARCH),zarch) COMMONOBJS += dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +COMMONOBJS += dynamic_mips64.$(SUFFIX) +else COMMONOBJS += dynamic.$(SUFFIX) endif endif endif +endif else COMMONOBJS += parameter.$(SUFFIX) endif @@ -92,10 +96,14 @@ else ifeq ($(ARCH),zarch) HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_zarch.$(SUFFIX) else +ifeq ($(ARCH),mips64) +HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_mips64.$(SUFFIX) +else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX) endif endif endif +endif else HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX) endif diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c index 30e0cc6c2..5e0943c2e 100644 --- a/driver/others/blas_server.c +++ b/driver/others/blas_server.c @@ -967,9 +967,11 @@ void goto_set_num_threads(int num_threads) { blas_cpu_number = num_threads; #if defined(ARCH_MIPS64) +#ifndef DYNAMIC_ARCH //set parameters for 
different number of threads. blas_set_parameter(); #endif +#endif } diff --git a/driver/others/dynamic_mips64.c b/driver/others/dynamic_mips64.c new file mode 100644 index 000000000..9fd19d739 --- /dev/null +++ b/driver/others/dynamic_mips64.c @@ -0,0 +1,230 @@ +/***************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +#include +#include +#include +#include +#include +#include +#include "common.h" + +extern gotoblas_t gotoblas_LOONGSON3R3; +extern gotoblas_t gotoblas_LOONGSON3R4; + +extern void openblas_warning(int verbose, const char * msg); + +#define NUM_CORETYPES 2 + +static char *corename[] = { + "loongson3r3", + "loongson3r4", + "UNKNOWN" +}; + +char *gotoblas_corename(void) { + if (gotoblas == &gotoblas_LOONGSON3R3) return corename[0]; + if (gotoblas == &gotoblas_LOONGSON3R4) return corename[1]; + return corename[NUM_CORETYPES]; +} + +static gotoblas_t *force_coretype(char *coretype) { + int i; + int found = -1; + char message[128]; + + for ( i=0 ; i < NUM_CORETYPES; i++) + { + if (!strncasecmp(coretype, corename[i], 20)) + { + found = i; + break; + } + } + + switch (found) + { + case 0: return (&gotoblas_LOONGSON3R3); + case 1: return (&gotoblas_LOONGSON3R4); + } + snprintf(message, 128, "Core not found: %s\n", coretype); + openblas_warning(1, message); + return NULL; +} + +#define MMI_MASK 0x00000010 +#define MSA_MASK 0x00000020 + +int fd[2]; +int support_cpucfg; + +static void handler(int signum) +{ + close(fd[1]); + exit(1); +} + +/* Brief : Function to check if cpucfg supported on loongson + * Return: 1 supported + * 0 not supported + */ +static int cpucfg_test(void) { + pid_t pid; + int status = 0; + + support_cpucfg = 0; + pipe(fd); + pid = fork(); + if (pid == 0) { /* Subprocess */ + struct sigaction act; + close(fd[0]); + /* Set signal action for SIGILL. */ + act.sa_handler = handler; + sigaction(SIGILL,&act,NULL); + + /* Execute cpucfg in subprocess. 
*/ + __asm__ volatile( + ".insn \n\t" + ".word (0xc8080118) \n\t" + ::: + ); + support_cpucfg = 1; + write(fd[1],&support_cpucfg,sizeof(support_cpucfg)); + close(fd[1]); + exit(0); + } else if (pid > 0){ /* Parent process*/ + close(fd[1]); + if ((waitpid(pid,&status,0) <= 0) || + (read(fd[0],&support_cpucfg,sizeof(support_cpucfg)) <= 0)) + support_cpucfg = 0; + close(fd[0]); + } else { + support_cpucfg = 0; + } + + return support_cpucfg; +} + +static gotoblas_t *get_coretype_from_cpucfg(void) { + int flag = 0; + __asm__ volatile( + ".insn \n\t" + "dli $8, 0x01 \n\t" + ".word (0xc9084918) \n\t" + "usw $9, 0x00(%0) \n\t" + : + : "r"(&flag) + : "memory" + ); + if (flag & MSA_MASK) + return (&gotoblas_LOONGSON3R4); + if (flag & MMI_MASK) + return (&gotoblas_LOONGSON3R3); + return NULL; +} + +static gotoblas_t *get_coretype_from_cpuinfo(void) { +#ifdef linux + FILE *infile; + char buffer[512], *p; + + p = (char *)NULL; + //Check model name for Loongson3 + infile = fopen("/proc/cpuinfo", "r"); + while (fgets(buffer, sizeof(buffer), infile)){ + if (!strncmp("model name", buffer, 10)){ + p = strchr(buffer, ':') + 2; + break; + } + } + fclose(infile); + if(p != NULL){ + if (strstr(p, "Loongson-3A3000") || strstr(p, "Loongson-3B3000")) + return (&gotoblas_LOONGSON3R3); + else if(strstr(p, "Loongson-3A4000") || strstr(p, "Loongson-3B4000")) + return (&gotoblas_LOONGSON3R4); + else + return NULL; + } +#endif + return NULL; +} + +static gotoblas_t *get_coretype(void) { + int ret = 0; + + ret = cpucfg_test(); + if (ret == 1) + return get_coretype_from_cpucfg(); + else + return get_coretype_from_cpuinfo(); +} + +void gotoblas_dynamic_init(void) { + char coremsg[128]; + char coren[22]; + char *p; + + if (gotoblas) return; + + p = getenv("OPENBLAS_CORETYPE"); + if ( p ) + { + gotoblas = force_coretype(p); + } + else + { + gotoblas = get_coretype(); + } + + if (gotoblas == NULL) + { + snprintf(coremsg, 128, "Falling back to loongson3r3 core\n"); + openblas_warning(1, coremsg); + gotoblas = &gotoblas_LOONGSON3R3; + } + + if (gotoblas && gotoblas->init) { + strncpy(coren, gotoblas_corename(), 20); + sprintf(coremsg, "Core: %s\n", coren); + openblas_warning(2, coremsg); + gotoblas -> init(); + } else { + openblas_warning(0, "OpenBLAS : Architecture Initialization failed. 
No initialization function found.\n"); + exit(1); + } + +} + +void gotoblas_dynamic_quit(void) { + gotoblas = NULL; +} diff --git a/driver/others/dynamic_power.c b/driver/others/dynamic_power.c index 85fc5b3ba..d60ae68fc 100644 --- a/driver/others/dynamic_power.c +++ b/driver/others/dynamic_power.c @@ -52,6 +52,9 @@ static gotoblas_t *get_coretype(void) { if (__builtin_cpu_supports ("arch_3_1") && __builtin_cpu_supports ("mma")) return &gotoblas_POWER10; #endif + /* Fall back to the POWER9 implementation if the toolchain is too old or the MMA feature is not set */ + if (__builtin_cpu_is("power10")) + return &gotoblas_POWER9; return NULL; } diff --git a/driver/others/parameter.c b/driver/others/parameter.c index 35fc0a253..36da13369 100644 --- a/driver/others/parameter.c +++ b/driver/others/parameter.c @@ -717,7 +717,7 @@ void blas_set_parameter(void){ #if defined(ARCH_MIPS64) void blas_set_parameter(void){ -#if defined(LOONGSON3A) +#if defined(LOONGSON3R3) || defined(LOONGSON3R4) #ifdef SMP if(blas_num_threads == 1){ #endif @@ -731,20 +731,6 @@ void blas_set_parameter(void){ #endif #endif -#if defined(LOONGSON3B) -#ifdef SMP - if(blas_num_threads == 1 || blas_num_threads == 2){ -#endif - //single thread - dgemm_r = 640; -#ifdef SMP - }else{ - //multi thread - dgemm_r = 160; - } -#endif -#endif - } #endif diff --git a/getarch.c b/getarch.c index 9344defb5..29671736e 100644 --- a/getarch.c +++ b/getarch.c @@ -140,8 +140,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. /* #define FORCE_PPC440FP2 */ /* #define FORCE_CELL */ /* #define FORCE_SICORTEX */ -/* #define FORCE_LOONGSON3A */ -/* #define FORCE_LOONGSON3B */ +/* #define FORCE_LOONGSON3R3 */ +/* #define FORCE_LOONGSON3R4 */ /* #define FORCE_I6400 */ /* #define FORCE_P6600 */ /* #define FORCE_P5600 */ @@ -814,31 +814,31 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#endif -#ifdef FORCE_LOONGSON3A +#if defined FORCE_LOONGSON3R3 || defined FORCE_LOONGSON3A || defined FORCE_LOONGSON3B #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3A" +#define SUBARCHITECTURE "LOONGSON3R3" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3A " \ +#define ARCHCONFIG "-DLOONGSON3R3 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3a" -#define CORENAME "LOONGSON3A" +#define LIBNAME "loongson3r3" +#define CORENAME "LOONGSON3R3" #else #endif -#ifdef FORCE_LOONGSON3B +#ifdef FORCE_LOONGSON3R4 #define FORCE #define ARCHITECTURE "MIPS" -#define SUBARCHITECTURE "LOONGSON3B" +#define SUBARCHITECTURE "LOONGSON3R4" #define SUBDIRNAME "mips64" -#define ARCHCONFIG "-DLOONGSON3B " \ +#define ARCHCONFIG "-DLOONGSON3R4 " \ "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \ "-DL2_SIZE=512488 -DL2_LINESIZE=32 " \ "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 " -#define LIBNAME "loongson3b" -#define CORENAME "LOONGSON3B" +#define LIBNAME "loongson3r4" +#define CORENAME "LOONGSON3R4" #else #endif diff --git a/kernel/Makefile b/kernel/Makefile index fb1d5d39a..4e86546b9 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -58,6 +58,8 @@ else ifeq ($(TARGET_CORE), SKYLAKEX) endif else ifeq ($(TARGET_CORE), HASWELL) override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(AVX2OPT) +else ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) $(MSA_FLAGS) else override CFLAGS += -DBUILD_KERNEL -DTABLE_NAME=gotoblas_$(TARGET_CORE) endif @@ -68,6 +70,9 @@ else TARGET_CORE = $(CORE) KDIR = TSUFFIX = +ifeq ($(TARGET_CORE), LOONGSON3R4) + override CFLAGS += $(MSA_FLAGS) +endif endif -include $(KERNELDIR)/KERNEL.$(TARGET_CORE) diff --git a/kernel/Makefile.L3 b/kernel/Makefile.L3 index 893713769..d8d739965 100644 --- a/kernel/Makefile.L3 +++ b/kernel/Makefile.L3 @@ -29,10 +29,6 @@ ifeq ($(ARCH), riscv64) USE_TRMM = 1 endif -ifeq ($(TARGET), LOONGSON3B) -USE_TRMM = 1 -endif - ifneq ($(DYNAMIC_ARCH), 1) ifeq ($(TARGET), GENERIC) USE_TRMM = 1 diff --git a/kernel/mips/cgemm_kernel_8x4_msa.c b/kernel/mips/cgemm_kernel_8x4_msa.c index 8b624be88..aa3f1dcfa 100644 --- a/kernel/mips/cgemm_kernel_8x4_msa.c +++ b/kernel/mips/cgemm_kernel_8x4_msa.c @@ -121,7 +121,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_KERNEL_8X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP4_INC(pa0, 4, src_a0, src_a1, src_a2, src_a3); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ @@ -200,7 +200,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CGEMM_KERNEL_4X1_MSA(OP0, OP1, OP2, OP3, OP4) \ { \ LD_SP2_INC(pa0, 4, src_a0, src_a1); \ - src_bi = (v4f32) __msa_cast_to_vector_double(*((double *) pb0)); \ + src_bi = (v4f32) COPY_DOUBLE_TO_VECTOR(*((double *) pb0)); \ SPLATI_W2_SP(src_bi, 0, src_br, src_bi); \ \ PCKEVOD_W2_SP(src_a1, src_a0, src_a0r, src_a0i); \ diff --git a/kernel/mips/crot_msa.c b/kernel/mips/crot_msa.c index 5273e38a3..84eb54d6d 100644 --- a/kernel/mips/crot_msa.c +++ b/kernel/mips/crot_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 2 elements */ for (j = (n >> 1); j--;) diff --git a/kernel/mips/cscal_msa.c b/kernel/mips/cscal_msa.c index 11a1450cf..451d0c921 100644 --- a/kernel/mips/cscal_msa.c +++ b/kernel/mips/cscal_msa.c @@ -49,11 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dscal_msa.c b/kernel/mips/dscal_msa.c index 6ce0375ab..2e41d8bef 100644 --- a/kernel/mips/dscal_msa.c +++ b/kernel/mips/dscal_msa.c @@ -44,9 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 5); i--;) { diff --git a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c index 9fb5141ca..e2cd3aa4b 100644 --- a/kernel/mips/dtrsm_kernel_LN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LN_8x4_msa.c @@ -186,8 +186,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(src_c14, src_c10, res_c12, res_c13); ILVRL_D2_DP(src_c15, src_c11, res_c14, res_c15); - src_a54 = __msa_cast_to_vector_double(*(a + 54)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a + 54)); src_a62 = LD_DP(a + 62); src_a63 = (v2f64) __msa_splati_d((v2i64) src_a62, 1); src_a62 = (v2f64) __msa_splati_d((v2i64) src_a62, 0); @@ -200,8 +199,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a44 = LD_DP(a + 44); src_a45 = (v2f64) __msa_splati_d((v2i64) src_a44, 1); src_a44 = (v2f64) __msa_splati_d((v2i64) src_a44, 0); - src_a36 = __msa_cast_to_vector_double(*(a + 36)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a + 36)); res_c7 *= src_a63; res_c6 -= res_c7 * src_a62; @@ -271,8 +269,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a26 = LD_DP(a + 26); src_a27 = (v2f64) __msa_splati_d((v2i64) src_a26, 1); src_a26 = (v2f64) __msa_splati_d((v2i64) src_a26, 0); - src_a18 = __msa_cast_to_vector_double(*(a + 18)); - src_a18 = (v2f64) __msa_splati_d((v2i64) 
src_a18, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a + 18)); res_c3 -= res_c7 * src_a59; res_c2 -= res_c7 * src_a58; @@ -358,8 +355,7 @@ void dsolve_8x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_a8 = LD_DP(a + 8); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); res_c1 -= res_c2 * src_a17; res_c1 *= src_a9; @@ -488,8 +484,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a52 = LD_DP(a - 12); src_a53 = (v2f64) __msa_splati_d((v2i64) src_a52, 1); src_a52 = (v2f64) __msa_splati_d((v2i64) src_a52, 0); - src_a54 = __msa_cast_to_vector_double(*(a - 10)); - src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); + src_a54 = COPY_DOUBLE_TO_VECTOR(*(a -10)); src_a40 = LD_DP(a - 24); src_a41 = (v2f64) __msa_splati_d((v2i64) src_a40, 1); @@ -526,8 +521,7 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a34 = LD_DP(a - 30); src_a35 = (v2f64) __msa_splati_d((v2i64) src_a34, 1); src_a34 = (v2f64) __msa_splati_d((v2i64) src_a34, 0); - src_a36 = __msa_cast_to_vector_double(*(a - 28)); - src_a36 = (v2f64) __msa_splati_d((v2i64) src_a36, 0); + src_a36 = COPY_DOUBLE_TO_VECTOR(*(a -28)); res_c4 *= src_a36; res_c3 -= res_c4 * src_a35; @@ -544,10 +538,8 @@ static void dsolve_8x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a16 = LD_DP(a - 48); src_a17 = (v2f64) __msa_splati_d((v2i64) src_a16, 1); src_a16 = (v2f64) __msa_splati_d((v2i64) src_a16, 0); - src_a18 = __msa_cast_to_vector_double(*(a - 46)); - src_a18 = (v2f64) __msa_splati_d((v2i64) src_a18, 0); - src_a0 = __msa_cast_to_vector_double(*(a - 64)); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a18 = COPY_DOUBLE_TO_VECTOR(*(a - 46)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a - 64)); src_a8 = LD_DP(a - 56); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a8, 1); src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); @@ -785,11 +777,8 @@ static void dsolve_4x4_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); @@ -890,11 +879,8 @@ static void dsolve_4x2_ln_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = (v2f64) __msa_splati_d((v2i64) src_a9, 1); src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); - src_a8 = __msa_cast_to_vector_double(*(a + 8)); - src_a0 = __msa_cast_to_vector_double(*(a + 0)); - - src_a8 = (v2f64) __msa_splati_d((v2i64) src_a8, 0); - src_a0 = (v2f64) __msa_splati_d((v2i64) src_a0, 0); + src_a8 = COPY_DOUBLE_TO_VECTOR(*(a + 8)); + src_a0 = COPY_DOUBLE_TO_VECTOR(*(a + 0)); src_a4 = LD_DP(a + 4); src_a5 = (v2f64) __msa_splati_d((v2i64) src_a4, 1); diff --git a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c index 525fc8585..74cc1278a 100644 --- a/kernel/mips/dtrsm_kernel_LT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_LT_8x4_msa.c @@ -215,8 +215,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, 
BLASLONG ldc, BLASLONG bk) res_c14 -= res_c8 * src_a6; res_c15 -= res_c8 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -280,8 +279,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c10 * src_a22; res_c15 -= res_c10 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -326,8 +324,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) res_c14 -= res_c12 * src_a38; res_c15 -= res_c12 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -353,8 +350,7 @@ void dsolve_8x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) ILVRL_D2_DP(res_c5, res_c4, src_c2, src_c6); ILVRL_D2_DP(res_c13, res_c12, src_c10, src_c14); - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -478,8 +474,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c0 * src_a6; res_c7 -= res_c0 * src_a7; - src_a9 = __msa_cast_to_vector_double(*(a + 9)); - src_a9 = (v2f64) __msa_splati_d((v2i64) src_a9, 0); + src_a9 = COPY_DOUBLE_TO_VECTOR(*(a + 9)); src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); @@ -515,8 +510,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c2 * src_a22; res_c7 -= res_c2 * src_a23; - src_a27 = __msa_cast_to_vector_double(*(a + 27)); - src_a27 = (v2f64) __msa_splati_d((v2i64) src_a27, 0); + src_a27 = COPY_DOUBLE_TO_VECTOR(*(a + 27)); src_a28 = LD_DP(a + 28); src_a29 = (v2f64) __msa_splati_d((v2i64) src_a28, 1); src_a28 = (v2f64) __msa_splati_d((v2i64) src_a28, 0); @@ -553,8 +547,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a38; res_c7 -= res_c4 * src_a39; - src_a45 = __msa_cast_to_vector_double(*(a + 45)); - src_a45 = (v2f64) __msa_splati_d((v2i64) src_a45, 0); + src_a45 = COPY_DOUBLE_TO_VECTOR(*(a + 45)); src_a46 = LD_DP(a + 46); src_a47 = (v2f64) __msa_splati_d((v2i64) src_a46, 1); src_a46 = (v2f64) __msa_splati_d((v2i64) src_a46, 0); @@ -563,8 +556,7 @@ static void dsolve_8x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c5 * src_a46; res_c7 -= res_c5 * src_a47; - src_a63 = __msa_cast_to_vector_double(*(a + 63)); - src_a63 = (v2f64) __msa_splati_d((v2i64) src_a63, 0); + src_a63 = COPY_DOUBLE_TO_VECTOR(*(a + 63)); src_a54 = LD_DP(a + 54); src_a55 = (v2f64) __msa_splati_d((v2i64) src_a54, 1); src_a54 = (v2f64) __msa_splati_d((v2i64) src_a54, 0); @@ -786,8 +778,7 @@ static void 
dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c6 -= res_c4 * src_a2; res_c7 -= res_c4 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -803,8 +794,7 @@ static void dsolve_4x4_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; @@ -881,8 +871,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO res_c2 -= res_c0 * src_a2; res_c3 -= res_c0 * src_a3; - src_a5 = __msa_cast_to_vector_double(*(a + 5)); - src_a5 = (v2f64) __msa_splati_d((v2i64) src_a5, 0); + src_a5 = COPY_DOUBLE_TO_VECTOR(*(a + 5)); src_a6 = LD_DP(a + 6); src_a7 = (v2f64) __msa_splati_d((v2i64) src_a6, 1); src_a6 = (v2f64) __msa_splati_d((v2i64) src_a6, 0); @@ -894,8 +883,7 @@ static void dsolve_4x2_lt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_a10 = LD_DP(a + 10); src_a11 = (v2f64) __msa_splati_d((v2i64) src_a10, 1); src_a10 = (v2f64) __msa_splati_d((v2i64) src_a10, 0); - src_a15 = __msa_cast_to_vector_double(*(a + 15)); - src_a15 = (v2f64) __msa_splati_d((v2i64) src_a15, 0); + src_a15 = COPY_DOUBLE_TO_VECTOR(*(a + 15)); res_c2 *= src_a10; res_c3 -= res_c2 * src_a11; diff --git a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c index cb361c511..03036f1c7 100644 --- a/kernel/mips/dtrsm_kernel_RN_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RN_8x4_msa.c @@ -161,16 +161,14 @@ void dsolve_8x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -294,8 +292,7 @@ static void dsolve_8x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -347,8 +344,7 @@ static void dsolve_8x1_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) } } - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -407,16 +403,14 @@ static void dsolve_4x4_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b2 = LD_DP(b + 2); src_b3 = 
(v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); - src_b5 = __msa_cast_to_vector_double(*(b + 5)); - src_b5 = (v2f64) __msa_splati_d((v2i64) src_b5, 0); + src_b5 = COPY_DOUBLE_TO_VECTOR(*(b + 5)); src_b6 = LD_DP(b + 6); src_b7 = (v2f64) __msa_splati_d((v2i64) src_b6, 1); src_b6 = (v2f64) __msa_splati_d((v2i64) src_b6, 0); src_b10 = LD_DP(b + 10); src_b11 = (v2f64) __msa_splati_d((v2i64) src_b10, 1); src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); - src_b15 = __msa_cast_to_vector_double(*(b + 15)); - src_b15 = (v2f64) __msa_splati_d((v2i64) src_b15, 0); + src_b15 = COPY_DOUBLE_TO_VECTOR(*(b + 15)); src_c0 *= src_b0; src_c1 *= src_b0; @@ -490,8 +484,7 @@ static void dsolve_4x2_rn_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b0 = LD_DP(b + 0); src_b1 = (v2f64) __msa_splati_d((v2i64) src_b0, 1); src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); - src_b3 = __msa_cast_to_vector_double(*(b + 3)); - src_b3 = (v2f64) __msa_splati_d((v2i64) src_b3, 0); + src_b3 = COPY_DOUBLE_TO_VECTOR(*(b + 3)); src_c0 *= src_b0; src_c1 *= src_b0; diff --git a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c index 581a90f71..4c55a0f37 100644 --- a/kernel/mips/dtrsm_kernel_RT_8x4_msa.c +++ b/kernel/mips/dtrsm_kernel_RT_8x4_msa.c @@ -168,11 +168,9 @@ void dsolve_8x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG bk) src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -298,8 +296,7 @@ static void dsolve_8x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 16; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); @@ -377,8 +374,7 @@ static void dsolve_8x1_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG bk) a -= 8; b -= 1; - src_b0 = __msa_cast_to_vector_double(*b); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*b); src_c0 *= src_b0; src_c1 *= src_b0; @@ -445,11 +441,9 @@ static void dsolve_4x4_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO src_b8 = LD_DP(b + 8); src_b9 = (v2f64) __msa_splati_d((v2i64) src_b8, 1); src_b8 = (v2f64) __msa_splati_d((v2i64) src_b8, 0); - src_b10 = __msa_cast_to_vector_double(*(b + 10)); - src_b10 = (v2f64) __msa_splati_d((v2i64) src_b10, 0); + src_b10 = COPY_DOUBLE_TO_VECTOR(*(b + 10)); - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b4 = LD_DP(b + 4); src_b5 = (v2f64) __msa_splati_d((v2i64) src_b4, 1); src_b4 = (v2f64) __msa_splati_d((v2i64) src_b4, 0); @@ -527,8 +521,7 @@ static void dsolve_4x2_rt_msa(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLO a -= 8; b -= 4; - src_b0 = __msa_cast_to_vector_double(*(b + 0)); - src_b0 = (v2f64) __msa_splati_d((v2i64) src_b0, 0); + src_b0 = 
COPY_DOUBLE_TO_VECTOR(*(b + 0)); src_b2 = LD_DP(b + 2); src_b3 = (v2f64) __msa_splati_d((v2i64) src_b2, 1); src_b2 = (v2f64) __msa_splati_d((v2i64) src_b2, 0); diff --git a/kernel/mips/macros_msa.h b/kernel/mips/macros_msa.h index ee0dea0b7..b887800ed 100644 --- a/kernel/mips/macros_msa.h +++ b/kernel/mips/macros_msa.h @@ -63,16 +63,12 @@ inline static void prefetch_load_lf(unsigned char *src) #define ST_DP(...) ST_D(v2f64, __VA_ARGS__) #define COPY_FLOAT_TO_VECTOR(a) ( { \ - v4f32 out; \ - out = __msa_cast_to_vector_float(a); \ - out = (v4f32) __msa_splati_w((v4i32) out, 0); \ + v4f32 out = {a, a, a, a}; \ out; \ } ) #define COPY_DOUBLE_TO_VECTOR(a) ( { \ - v2f64 out; \ - out = __msa_cast_to_vector_double(a); \ - out = (v2f64) __msa_splati_d((v2i64) out, 0); \ + v2f64 out = {a, a}; \ out; \ } ) diff --git a/kernel/mips/srot_msa.c b/kernel/mips/srot_msa.c index 75730241a..79d921b7a 100644 --- a/kernel/mips/srot_msa.c +++ b/kernel/mips/srot_msa.c @@ -48,11 +48,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, { if ((0 == c) && (0 == s)) { - v4f32 zero = __msa_cast_to_vector_float(0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 0, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 1, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 2, 0.0); - zero = (v4f32) __msa_insert_w((v4i32) zero, 3, 0.0); + v4f32 zero = {0.0, 0.0, 0.0, 0.0}; /* process 4 floats */ for (j = (n >> 2); j--;) diff --git a/kernel/mips/sscal_msa.c b/kernel/mips/sscal_msa.c index 64b62d659..66e17b844 100644 --- a/kernel/mips/sscal_msa.c +++ b/kernel/mips/sscal_msa.c @@ -44,11 +44,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, { if (0.0 == da) { - v4f32 zero_v = __msa_cast_to_vector_float(0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 0, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 1, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 2, 0.0); - zero_v = (v4f32) __msa_insert_w((v4i32) zero_v, 3, 0.0); + v4f32 zero_v = {0.0, 0.0, 0.0, 0.0}; for (i = (n >> 6); i--;) { diff --git a/kernel/mips/zscal_msa.c b/kernel/mips/zscal_msa.c index 5a8766d3c..a45c3cecd 100644 --- a/kernel/mips/zscal_msa.c +++ b/kernel/mips/zscal_msa.c @@ -49,9 +49,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, { if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { @@ -475,9 +473,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i, if ((0.0 == da_r) && (0.0 == da_i)) { - v2f64 zero_v = __msa_cast_to_vector_double(0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 0, 0.0); - zero_v = (v2f64) __msa_insert_d((v2i64) zero_v, 1, 0.0); + v2f64 zero_v = {0.0, 0.0}; for (i = (n >> 4); i--;) { diff --git a/kernel/mips64/KERNEL.LOONGSON3B b/kernel/mips64/KERNEL.LOONGSON3B deleted file mode 100644 index e476c631e..000000000 --- a/kernel/mips64/KERNEL.LOONGSON3B +++ /dev/null @@ -1,64 +0,0 @@ -SAXPYKERNEL=axpy_loongson3a.S -DAXPYKERNEL=daxpy_loongson3a_simd.S - -SGEMVNKERNEL = gemv_n_loongson3a.c -SGEMVTKERNEL = gemv_t_loongson3a.c -DGEMVNKERNEL = gemv_n_loongson3a.c -DGEMVTKERNEL = gemv_t_loongson3a.c -CGEMVNKERNEL = zgemv_n_loongson3a.c -CGEMVTKERNEL = zgemv_t_loongson3a.c -ZGEMVNKERNEL = zgemv_n_loongson3a.c -ZGEMVTKERNEL = zgemv_t_loongson3a.c - -STRMMKERNEL = ../generic/trmmkernel_2x2.c -DTRMMKERNEL 
= ../generic/trmmkernel_2x2.c -CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c -ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c - -SGEMMKERNEL = ../generic/gemmkernel_2x2.c -SGEMMONCOPY = ../generic/gemm_ncopy_2.c -SGEMMOTCOPY = ../generic/gemm_tcopy_2.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -DGEMMKERNEL = ../generic/gemmkernel_2x2.c -DGEMMONCOPY = ../generic/gemm_ncopy_2.c -DGEMMOTCOPY = ../generic/gemm_tcopy_2.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -CGEMMKERNEL = ../generic/zgemmkernel_2x2.c -CGEMMONCOPY = ../generic/zgemm_ncopy_2.c -CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o - -ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c -ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c -ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o - -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - -ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c - - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3A b/kernel/mips64/KERNEL.LOONGSON3R3 similarity index 75% rename from kernel/mips64/KERNEL.LOONGSON3A rename to kernel/mips64/KERNEL.LOONGSON3R3 index 0298faaad..904828d57 100644 --- a/kernel/mips64/KERNEL.LOONGSON3A +++ b/kernel/mips64/KERNEL.LOONGSON3R3 @@ -16,32 +16,32 @@ SGEMMINCOPY = ../generic/gemm_ncopy_8.c SGEMMITCOPY = ../generic/gemm_tcopy_8.c SGEMMONCOPY = ../generic/gemm_ncopy_4.c SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMINCOPYOBJ = sgemm_incopy.o -SGEMMITCOPYOBJ = sgemm_itcopy.o -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S DGEMMONCOPY = ../generic/gemm_ncopy_4.c DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S CGEMMINCOPY = ../generic/zgemm_ncopy_4.c CGEMMITCOPY = ../generic/zgemm_tcopy_4.c CGEMMONCOPY = ../generic/zgemm_ncopy_2.c CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -CGEMMINCOPYOBJ = cgemm_incopy.o -CGEMMITCOPYOBJ = cgemm_itcopy.o -CGEMMONCOPYOBJ = cgemm_oncopy.o -CGEMMOTCOPYOBJ = cgemm_otcopy.o +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c -ZGEMMONCOPYOBJ = zgemm_oncopy.o -ZGEMMOTCOPYOBJ = zgemm_otcopy.o +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) 
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c @@ -64,6 +64,3 @@ ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c DSDOTKERNEL = ../mips/dot.c - - - diff --git a/kernel/mips64/KERNEL.LOONGSON3R4 b/kernel/mips64/KERNEL.LOONGSON3R4 new file mode 100644 index 000000000..b81e5441d --- /dev/null +++ b/kernel/mips64/KERNEL.LOONGSON3R4 @@ -0,0 +1,192 @@ +ifdef HAVE_MSA +SAXPYKERNEL = ../mips/saxpy_msa.c +DAXPYKERNEL = ../mips/daxpy_msa.c +CAXPYKERNEL = ../mips/caxpy_msa.c +ZAXPYKERNEL = ../mips/zaxpy_msa.c +else +SAXPYKERNEL = axpy_loongson3a.S +DAXPYKERNEL = daxpy_loongson3a_simd.S +endif + +ifdef HAVE_MSA +SCOPYKERNEL = ../mips/scopy_msa.c +DCOPYKERNEL = ../mips/dcopy_msa.c +CCOPYKERNEL = ../mips/ccopy_msa.c +ZCOPYKERNEL = ../mips/zcopy_msa.c +endif + +ifdef HAVE_MSA +SDOTKERNEL = ../mips/sdot_msa.c +DDOTKERNEL = ../mips/ddot_msa.c +CDOTKERNEL = ../mips/cdot_msa.c +ZDOTKERNEL = ../mips/zdot_msa.c +endif +DSDOTKERNEL = ../mips/dot.c + +ifdef HAVE_MSA +SROTKERNEL = ../mips/srot_msa.c +DROTKERNEL = ../mips/drot_msa.c +CROTKERNEL = ../mips/crot_msa.c +ZROTKERNEL = ../mips/zrot_msa.c +endif + +ifdef HAVE_MSA +SSCALKERNEL = ../mips/sscal_msa.c +DSCALKERNEL = ../mips/dscal_msa.c +CSCALKERNEL = ../mips/cscal_msa.c +ZSCALKERNEL = ../mips/zscal_msa.c +endif + +ifdef HAVE_MSA +SGEMVNKERNEL = ../mips/sgemv_n_msa.c +DGEMVNKERNEL = ../mips/dgemv_n_msa.c +SGEMVTKERNEL = ../mips/sgemv_t_msa.c +DGEMVTKERNEL = ../mips/dgemv_t_msa.c +CGEMVNKERNEL = ../mips/cgemv_n_msa.c +CGEMVTKERNEL = ../mips/cgemv_t_msa.c +ZGEMVNKERNEL = ../mips/zgemv_n_msa.c +ZGEMVTKERNEL = ../mips/zgemv_t_msa.c +else +SGEMVNKERNEL = gemv_n_loongson3a.c +SGEMVTKERNEL = gemv_t_loongson3a.c +DGEMVNKERNEL = gemv_n_loongson3a.c +DGEMVTKERNEL = gemv_t_loongson3a.c +CGEMVNKERNEL = zgemv_n_loongson3a.c +CGEMVTKERNEL = zgemv_t_loongson3a.c +ZGEMVNKERNEL = zgemv_n_loongson3a.c +ZGEMVTKERNEL = zgemv_t_loongson3a.c +endif + +ifdef HAVE_MSA +SASUMKERNEL = ../mips/sasum_msa.c +DASUMKERNEL = ../mips/dasum_msa.c +CASUMKERNEL = ../mips/casum_msa.c +ZASUMKERNEL = ../mips/zasum_msa.c +endif + +ifdef HAVE_MSA +SSWAPKERNEL = ../mips/sswap_msa.c +DSWAPKERNEL = ../mips/dswap_msa.c +CSWAPKERNEL = ../mips/cswap_msa.c +ZSWAPKERNEL = ../mips/zswap_msa.c +endif + +ifdef HAVE_MSA +SGEMMKERNEL = ../mips/sgemm_kernel_8x8_msa.c +SGEMMONCOPY = ../mips/sgemm_ncopy_8_msa.c +SGEMMOTCOPY = ../mips/sgemm_tcopy_8_msa.c +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +SGEMMKERNEL = sgemm_kernel_8x4_ps.S +SGEMMINCOPY = ../generic/gemm_ncopy_8.c +SGEMMITCOPY = ../generic/gemm_tcopy_8.c +SGEMMONCOPY = ../generic/gemm_ncopy_4.c +SGEMMOTCOPY = ../generic/gemm_tcopy_4.c +SGEMMINCOPYOBJ = sgemm_incopy$(TSUFFIX).$(SUFFIX) +SGEMMITCOPYOBJ = sgemm_itcopy$(TSUFFIX).$(SUFFIX) +SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX) +SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +DGEMMKERNEL = ../mips/dgemm_kernel_8x4_msa.c +DGEMMINCOPY = ../mips/dgemm_ncopy_8_msa.c +DGEMMITCOPY = ../mips/dgemm_tcopy_8_msa.c +DGEMMONCOPY = ../mips/dgemm_ncopy_4_msa.c +DGEMMOTCOPY = ../mips/dgemm_tcopy_4_msa.c +DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX) +DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX) +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +DGEMMKERNEL = dgemm_kernel_loongson3a_4x4.S +DGEMMONCOPY = 
../generic/gemm_ncopy_4.c +DGEMMOTCOPY = ../generic/gemm_tcopy_4.c +DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX) +DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +CGEMMKERNEL = ../mips/cgemm_kernel_8x4_msa.c +CGEMMINCOPY = ../mips/cgemm_ncopy_8_msa.c +CGEMMITCOPY = ../mips/cgemm_tcopy_8_msa.c +CGEMMONCOPY = ../mips/cgemm_ncopy_4_msa.c +CGEMMOTCOPY = ../mips/cgemm_tcopy_4_msa.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +CGEMMKERNEL = cgemm_kernel_loongson3a_4x2_ps.S +CGEMMINCOPY = ../generic/zgemm_ncopy_4.c +CGEMMITCOPY = ../generic/zgemm_tcopy_4.c +CGEMMONCOPY = ../generic/zgemm_ncopy_2.c +CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX) +CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX) +CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX) +CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +ZGEMMKERNEL = ../mips/zgemm_kernel_4x4_msa.c +ZGEMMONCOPY = ../mips/zgemm_ncopy_4_msa.c +ZGEMMOTCOPY = ../mips/zgemm_tcopy_4_msa.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +else +ZGEMMKERNEL = zgemm_kernel_loongson3a_2x2.S +ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c +ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c +ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX) +ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) +endif + +ifdef HAVE_MSA +STRSMKERNEL_LN = ../mips/strsm_kernel_LN_8x8_msa.c +STRSMKERNEL_LT = ../mips/strsm_kernel_LT_8x8_msa.c +STRSMKERNEL_RN = ../mips/strsm_kernel_RN_8x8_msa.c +STRSMKERNEL_RT = ../mips/strsm_kernel_RT_8x8_msa.c +else +STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +DTRSMKERNEL_LN = ../mips/dtrsm_kernel_LN_8x4_msa.c +DTRSMKERNEL_LT = ../mips/dtrsm_kernel_LT_8x4_msa.c +DTRSMKERNEL_RN = ../mips/dtrsm_kernel_RN_8x4_msa.c +DTRSMKERNEL_RT = ../mips/dtrsm_kernel_RT_8x4_msa.c +else +DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif + +ifdef HAVE_MSA +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +else +ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c +ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c +ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +endif diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index c25cd9f04..d61f5194a 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -63,15 +63,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = 
../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c +DTRSMKERNEL_LT = trsm_kernel_LT_power10.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index ab8fbfcd9..2bd2516de 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -52,15 +52,15 @@ ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX) ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX) ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX) -STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c -STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c -STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +STRSMKERNEL_LN = trsm_kernel_LN_power10.c +STRSMKERNEL_LT = trsm_kernel_LT_power10.c +STRSMKERNEL_RN = trsm_kernel_RN_power10.c +STRSMKERNEL_RT = trsm_kernel_RT_power10.c -DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c +DTRSMKERNEL_LN = trsm_kernel_LN_power10.c DTRSMKERNEL_LT = dtrsm_kernel_LT_16x4_power8.S -DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c -DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c +DTRSMKERNEL_RN = trsm_kernel_RN_power10.c +DTRSMKERNEL_RT = trsm_kernel_RT_power10.c CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c diff --git a/kernel/power/caxpy_microk_power10.c b/kernel/power/caxpy_microk_power10.c index 0d13416b3..56a5ab47a 100644 --- a/kernel/power/caxpy_microk_power10.c +++ b/kernel/power/caxpy_microk_power10.c @@ -112,10 +112,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "addi %4, %4, 128 \n\t" "xxperm 52, 40, %x10 \n\t" // exchange real and imag part @@ -159,10 +163,14 @@ static void caxpy_kernel_8 (long n, float *x, float *y, "xvmaddasp 38, 58, 33 \n\t" "xvmaddasp 39, 59, 33 \n\t" - "stxvp 48, 0(%4) \n\t" - "stxvp 50, 32(%4) \n\t" - "stxvp 34, 64(%4) \n\t" - "stxvp 38, 96(%4) \n\t" + "stxv 49, 0(%4) \n\t" + "stxv 48, 16(%4) \n\t" + "stxv 51, 32(%4) \n\t" + "stxv 50, 48(%4) \n\t" + "stxv 35, 64(%4) \n\t" + "stxv 34, 80(%4) \n\t" + "stxv 39, 96(%4) \n\t" + "stxv 38, 112(%4) \n\t" "#n=%1 x=%5=%2 y=%0=%3 alpha=(%7,%8) mvecp=%6=%9 ytmp=%4\n" : diff --git a/kernel/power/daxpy_power10.c b/kernel/power/daxpy_power10.c index ebe91a80f..8640efcfd 100644 --- a/kernel/power/daxpy_power10.c +++ b/kernel/power/daxpy_power10.c @@ -66,12 +66,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -16; + if ( n >= 16 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; + for (i = 0; 
i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -16; + if ( n1 ) + daxpy_kernel_8(n1, &x[i], &y[i], da); - if ( n1 ) - daxpy_kernel_8(n1, x, y, da); + i += n1; - i = n1; while(i < n) { diff --git a/kernel/power/saxpy_power10.c b/kernel/power/saxpy_power10.c index 8c7c22390..4a13c1f88 100644 --- a/kernel/power/saxpy_power10.c +++ b/kernel/power/saxpy_power10.c @@ -64,12 +64,18 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( (inc_x == 1) && (inc_y == 1) ) { - BLASLONG n1 = n & -64; - + if ( n >= 64 ) + { + BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; + for (i = 0; i < align; i++) { + y[i] += da * x[i] ; + } + } + BLASLONG n1 = (n-i) & -64; if ( n1 ) - saxpy_kernel_64(n1, x, y, da); + saxpy_kernel_64(n1, &x[i], &y[i], da); - i = n1; + i += n1; while(i < n) { diff --git a/kernel/power/trsm_kernel_LN_power10.c b/kernel/power/trsm_kernel_LN_power10.c new file mode 100644 index 000000000..5ca1603a6 --- /dev/null +++ b/kernel/power/trsm_kernel_LN_power10.c @@ -0,0 +1,1280 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[29], 0); + VbS3 = vec_splat(Vb[29], 1); + VbS4 = vec_splat(Vb[30], 0); + VbS5 = vec_splat(Vb[30], 1); + VbS6 = vec_splat(Vb[31], 0); + VbS7 = vec_splat(Vb[31], 1); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[29], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + c0[6] -= c0[7] * a[62]; + c1[6] -= c1[7] * a[62]; + c2[6] -= c2[7] * a[62]; + c3[6] -= c3[7] * a[62]; + c4[6] -= c4[7] * a[62]; + c5[6] -= c5[7] * a[62]; + 
c6[6] -= c6[7] * a[62]; + c7[6] -= c7[7] * a[62]; + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + VbS6 = vec_splat(Vb[27], 0); + VbS7 = vec_splat(Vb[27], 1); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[25], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[21], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[21], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[21], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[21], Vc7[1]); + c0[4] -= c0[5] * a[44]; + c1[4] -= c1[5] * a[44]; + c2[4] -= c2[5] * a[44]; + c3[4] -= c3[5] * a[44]; + c4[4] -= c4[5] * a[44]; + c5[4] -= c5[5] * a[44]; + c6[4] -= c6[5] * a[44]; + c7[4] -= c7[5] * a[44]; + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, 
Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[17], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[17], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[17], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[17], Vc7[1]); + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[12], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[12], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[12], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[12], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[12], Vc7[0]); + c0[2] -= c0[3] * a[26]; + c1[2] -= c1[3] * a[26]; + c2[2] -= c2[3] * a[26]; + c3[2] -= c3[3] * a[26]; + c4[2] -= c4[3] * a[26]; + c5[2] -= c5[3] * a[26]; + c6[2] -= c6[3] * a[26]; + c7[2] -= c7[3] * a[26]; + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[0] = vec_nmsub(VbS0, Va[8], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[8], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[8], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[8], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[8], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[8], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[8], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[8], Vc7[0]); + + b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + c0[0] -= c0[1] * a[8]; + c1[0] -= c1[1] * a[8]; + c2[0] -= c2[1] * a[8]; + c3[0] -= c3[1] * a[8]; + c4[0] -= c4[1] * a[8]; + c5[0] -= c5[1] * a[8]; + c6[0] -= c6[1] * a[8]; + c7[0] -= c7[1] * a[8]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + 
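    /*
     * Vc0..Vc7 reinterpret the eight C columns as vectors of four floats so the
     * column updates below can use vec_nmsub.  The packed tile a is a 16x16
     * row-major block: element (i,j) is a[i*16 + j], and the diagonal entries
     * a[17*i] (a[255], a[238], a[221], ...) are assumed to hold reciprocals
     * prepared by the TRSM packing routine, so each pivot step is a multiply.
     * Rows are eliminated from 15 down to 0 (backward substitution for the LN
     * case), and every solved value is also written to the packed buffer b at
     * b[8*i + column] for use by the trailing GEMM update.
     */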
vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); + VbS0 = vec_splat(Vb[30], 0); + VbS1 = vec_splat(Vb[30], 1); + VbS2 = vec_splat(Vb[30], 2); + VbS3 = vec_splat(Vb[30], 3); + VbS4 = vec_splat(Vb[31], 0); + VbS5 = vec_splat(Vb[31], 1); + VbS6 = vec_splat(Vb[31], 2); + VbS7 = vec_splat(Vb[31], 3); + Vc0[0] = vec_nmsub(VbS0, Va[60], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[61], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[62], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[60], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[61], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[62], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[60], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[61], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[62], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[60], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[61], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[62], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[60], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[61], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[62], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[60], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[61], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[62], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[60], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[61], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[62], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[60], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[61], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[62], Vc7[2]); + c0[12] -= b[120] * a[252]; + c0[13] -= b[120] * a[253]; + c0[14] -= b[120] * a[254]; + c1[12] -= b[121] * a[252]; + c1[13] -= b[121] * a[253]; + c1[14] -= b[121] * a[254]; + c2[12] -= b[122] * a[252]; + c2[13] -= b[122] * a[253]; + c2[14] -= b[122] * a[254]; + c3[12] -= b[123] * a[252]; + c3[13] -= b[123] * a[253]; + c3[14] -= b[123] * a[254]; + c4[12] -= b[124] * a[252]; + c4[13] -= b[124] * a[253]; + c4[14] -= b[124] * a[254]; + c5[12] -= b[125] * a[252]; + c5[13] -= b[125] * a[253]; + c5[14] -= b[125] * a[254]; + c6[12] -= b[126] * a[252]; + c6[13] -= b[126] * a[253]; + c6[14] -= b[126] * a[254]; + c7[12] -= b[127] * a[252]; + c7[13] -= b[127] * a[253]; + c7[14] -= b[127] * a[254]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + VbS0 = vec_splat(Vb[28], 0); + VbS1 = vec_splat(Vb[28], 1); + VbS2 = vec_splat(Vb[28], 2); + VbS3 = vec_splat(Vb[28], 3); + VbS4 = vec_splat(Vb[29], 0); + VbS5 = vec_splat(Vb[29], 1); + VbS6 = vec_splat(Vb[29], 2); + VbS7 = vec_splat(Vb[29], 3); + Vc0[0] = vec_nmsub(VbS0, Va[56], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[57], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[58], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[56], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[57], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[58], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[56], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[57], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[58], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[56], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[57], 
Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[58], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[56], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[57], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[58], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[56], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[57], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[58], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[56], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[57], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[58], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[56], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[57], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[58], Vc7[2]); + c0[12] -= b[112] * a[236]; + c0[13] -= b[112] * a[237]; + c1[12] -= b[113] * a[236]; + c1[13] -= b[113] * a[237]; + c2[12] -= b[114] * a[236]; + c2[13] -= b[114] * a[237]; + c3[12] -= b[115] * a[236]; + c3[13] -= b[115] * a[237]; + c4[12] -= b[116] * a[236]; + c4[13] -= b[116] * a[237]; + c5[12] -= b[117] * a[236]; + c5[13] -= b[117] * a[237]; + c6[12] -= b[118] * a[236]; + c6[13] -= b[118] * a[237]; + c7[12] -= b[119] * a[236]; + c7[13] -= b[119] * a[237]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + VbS0 = vec_splat(Vb[26], 0); + VbS1 = vec_splat(Vb[26], 1); + VbS2 = vec_splat(Vb[26], 2); + VbS3 = vec_splat(Vb[26], 3); + VbS4 = vec_splat(Vb[27], 0); + VbS5 = vec_splat(Vb[27], 1); + VbS6 = vec_splat(Vb[27], 2); + VbS7 = vec_splat(Vb[27], 3); + Vc0[0] = vec_nmsub(VbS0, Va[52], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[53], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[54], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[52], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[53], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[54], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[52], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[53], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[54], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[52], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[53], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[54], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[52], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[53], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[54], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[52], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[53], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[54], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[52], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[53], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[54], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[52], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[53], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[54], Vc7[2]); + c0[12] -= b[104] * a[220]; + c1[12] -= b[105] * a[220]; + c2[12] -= b[106] * a[220]; + c3[12] -= b[107] * a[220]; + c4[12] -= b[108] * a[220]; + c5[12] -= b[109] * a[220]; + c6[12] -= b[110] * a[220]; + c7[12] -= b[111] * a[220]; + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[24], 2); + VbS3 = vec_splat(Vb[24], 3); + VbS4 = vec_splat(Vb[25], 0); + VbS5 = vec_splat(Vb[25], 1); + VbS6 = vec_splat(Vb[25], 2); + VbS7 = vec_splat(Vb[25], 3); + Vc0[0] = vec_nmsub(VbS0, Va[48], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[49], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[50], Vc0[2]); + Vc1[0] = vec_nmsub(VbS1, Va[48], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[49], Vc1[1]); + 
Vc1[2] = vec_nmsub(VbS1, Va[50], Vc1[2]); + Vc2[0] = vec_nmsub(VbS2, Va[48], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[49], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[50], Vc2[2]); + Vc3[0] = vec_nmsub(VbS3, Va[48], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[49], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[50], Vc3[2]); + Vc4[0] = vec_nmsub(VbS4, Va[48], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[49], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[50], Vc4[2]); + Vc5[0] = vec_nmsub(VbS5, Va[48], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[49], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[50], Vc5[2]); + Vc6[0] = vec_nmsub(VbS6, Va[48], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[49], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[50], Vc6[2]); + Vc7[0] = vec_nmsub(VbS7, Va[48], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[49], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[50], Vc7[2]); + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[0] = vec_nmsub(VbS0, Va[44], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[45], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[44], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[45], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[44], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[45], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[44], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[45], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[44], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[45], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[44], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[45], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[44], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[45], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[44], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[45], Vc7[1]); + c0[ 8] -= b[88] * a[184]; + c0[ 9] -= b[88] * a[185]; + c0[10] -= b[88] * a[186]; + c1[ 8] -= b[89] * a[184]; + c1[ 9] -= b[89] * a[185]; + c1[10] -= b[89] * a[186]; + c2[ 8] -= b[90] * a[184]; + c2[ 9] -= b[90] * a[185]; + c2[10] -= b[90] * a[186]; + c3[ 8] -= b[91] * a[184]; + c3[ 9] -= b[91] * a[185]; + c3[10] -= b[91] * a[186]; + c4[ 8] -= b[92] * a[184]; + c4[ 9] -= b[92] * a[185]; + c4[10] -= b[92] * a[186]; + c5[ 8] -= b[93] * a[184]; + c5[ 9] -= b[93] * a[185]; + c5[10] -= b[93] * a[186]; + c6[ 8] -= b[94] * a[184]; + c6[ 9] -= b[94] * a[185]; + c6[10] -= b[94] * a[186]; + c7[ 8] -= b[95] * a[184]; + c7[ 9] -= b[95] * a[185]; + c7[10] -= b[95] * a[186]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[0] = vec_nmsub(VbS0, Va[40], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[41], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[40], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[41], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[40], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[41], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[40], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[41], Vc3[1]); + Vc4[0] = 
vec_nmsub(VbS4, Va[40], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[41], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[40], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[41], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[40], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[41], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[40], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[41], Vc7[1]); + c0[8] -= b[80] * a[168]; + c0[9] -= b[80] * a[169]; + c1[8] -= b[81] * a[168]; + c1[9] -= b[81] * a[169]; + c2[8] -= b[82] * a[168]; + c2[9] -= b[82] * a[169]; + c3[8] -= b[83] * a[168]; + c3[9] -= b[83] * a[169]; + c4[8] -= b[84] * a[168]; + c4[9] -= b[84] * a[169]; + c5[8] -= b[85] * a[168]; + c5[9] -= b[85] * a[169]; + c6[8] -= b[86] * a[168]; + c6[9] -= b[86] * a[169]; + c7[8] -= b[87] * a[168]; + c7[9] -= b[87] * a[169]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[0] = vec_nmsub(VbS0, Va[36], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[37], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[36], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[37], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[36], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[37], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[36], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[37], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[36], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[37], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[36], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[37], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[36], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[37], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[36], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[37], Vc7[1]); + c0[8] -= b[72] * a[152]; + c1[8] -= b[73] * a[152]; + c2[8] -= b[74] * a[152]; + c3[8] -= b[75] * a[152]; + c4[8] -= b[76] * a[152]; + c5[8] -= b[77] * a[152]; + c6[8] -= b[78] * a[152]; + c7[8] -= b[79] * a[152]; + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[0] = vec_nmsub(VbS0, Va[32], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[33], Vc0[1]); + Vc1[0] = vec_nmsub(VbS1, Va[32], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[33], Vc1[1]); + Vc2[0] = vec_nmsub(VbS2, Va[32], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[33], Vc2[1]); + Vc3[0] = vec_nmsub(VbS3, Va[32], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[33], Vc3[1]); + Vc4[0] = vec_nmsub(VbS4, Va[32], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[33], Vc4[1]); + Vc5[0] = vec_nmsub(VbS5, Va[32], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[33], Vc5[1]); + Vc6[0] = vec_nmsub(VbS6, Va[32], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[33], Vc6[1]); + Vc7[0] = vec_nmsub(VbS7, Va[32], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[33], Vc7[1]); + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] 
*= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[28], Vc7[0]); + c0[4] -= b[56] * a[116]; + c0[5] -= b[56] * a[117]; + c0[6] -= b[56] * a[118]; + c1[4] -= b[57] * a[116]; + c1[5] -= b[57] * a[117]; + c1[6] -= b[57] * a[118]; + c2[4] -= b[58] * a[116]; + c2[5] -= b[58] * a[117]; + c2[6] -= b[58] * a[118]; + c3[4] -= b[59] * a[116]; + c3[5] -= b[59] * a[117]; + c3[6] -= b[59] * a[118]; + c4[4] -= b[60] * a[116]; + c4[5] -= b[60] * a[117]; + c4[6] -= b[60] * a[118]; + c5[4] -= b[61] * a[116]; + c5[5] -= b[61] * a[117]; + c5[6] -= b[61] * a[118]; + c6[4] -= b[62] * a[116]; + c6[5] -= b[62] * a[117]; + c6[6] -= b[62] * a[118]; + c7[4] -= b[63] * a[116]; + c7[5] -= b[63] * a[117]; + c7[6] -= b[63] * a[118]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[24], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[24], Vc7[0]); + c0[4] -= b[48] * a[100]; + c0[5] -= b[48] * a[101]; + c1[4] -= b[49] * a[100]; + c1[5] -= b[49] * a[101]; + c2[4] -= b[50] * a[100]; + c2[5] -= b[50] * a[101]; + c3[4] -= b[51] * a[100]; + c3[5] -= b[51] * a[101]; + c4[4] -= b[52] * a[100]; + c4[5] -= b[52] * a[101]; + c5[4] -= b[53] * a[100]; + c5[5] -= b[53] * a[101]; + c6[4] -= b[54] * a[100]; + c6[5] -= b[54] * a[101]; + c7[4] -= b[55] * a[100]; + c7[5] -= b[55] * a[101]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[20], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[20], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[20], Vc7[0]); + c0[4] -= b[40] * a[84]; + c1[4] -= b[41] * a[84]; + c2[4] -= b[42] * a[84]; + c3[4] -= b[43] * a[84]; + c4[4] -= b[44] * a[84]; + c5[4] -= b[45] * a[84]; + c6[4] -= b[46] 
* a[84]; + c7[4] -= b[47] * a[84]; + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc4[0] = vec_nmsub(VbS4, Va[16], Vc4[0]); + Vc5[0] = vec_nmsub(VbS5, Va[16], Vc5[0]); + Vc6[0] = vec_nmsub(VbS6, Va[16], Vc6[0]); + Vc7[0] = vec_nmsub(VbS7, Va[16], Vc7[0]); + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + c0[0] -= b[24] * a[48]; + c0[1] -= b[24] * a[49]; + c0[2] -= b[24] * a[50]; + c1[0] -= b[25] * a[48]; + c1[1] -= b[25] * a[49]; + c1[2] -= b[25] * a[50]; + c2[0] -= b[26] * a[48]; + c2[1] -= b[26] * a[49]; + c2[2] -= b[26] * a[50]; + c3[0] -= b[27] * a[48]; + c3[1] -= b[27] * a[49]; + c3[2] -= b[27] * a[50]; + c4[0] -= b[28] * a[48]; + c4[1] -= b[28] * a[49]; + c4[2] -= b[28] * a[50]; + c5[0] -= b[29] * a[48]; + c5[1] -= b[29] * a[49]; + c5[2] -= b[29] * a[50]; + c6[0] -= b[30] * a[48]; + c6[1] -= b[30] * a[49]; + c6[2] -= b[30] * a[50]; + c7[0] -= b[31] * a[48]; + c7[1] -= b[31] * a[49]; + c7[2] -= b[31] * a[50]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + c0[0] -= b[16] * a[32]; + c0[1] -= b[16] * a[33]; + c1[0] -= b[17] * a[32]; + c1[1] -= b[17] * a[33]; + c2[0] -= b[18] * a[32]; + c2[1] -= b[18] * a[33]; + c3[0] -= b[19] * a[32]; + c3[1] -= b[19] * a[33]; + c4[0] -= b[20] * a[32]; + c4[1] -= b[20] * a[33]; + c5[0] -= b[21] * a[32]; + c5[1] -= b[21] * a[33]; + c6[0] -= b[22] * a[32]; + c6[1] -= b[22] * a[33]; + c7[0] -= b[23] * a[32]; + c7[1] -= b[23] * a[33]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + c0[0] -= b[ 8] * a[16]; + c1[0] -= b[ 9] * a[16]; + c2[0] -= b[10] * a[16]; + c3[0] -= b[11] * a[16]; + c4[0] -= b[12] * a[16]; + c5[0] -= b[13] * a[16]; + c6[0] -= b[14] * a[16]; + c7[0] -= b[15] * a[16]; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (m - 1) * m; + b += (m - 1) * n; + + for (i = m - 1; i >= 0; i--) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = 0; k < i; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a -= m; + b -= 2 * n; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void 
solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + a += (m - 1) * m * 2; + b += (m - 1) * n * 2; + + for (i = m - 1; i >= 0; i--) { + + aa1 = *(a + i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= - cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a -= m * 2; + b -= 4 * n; + } + +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM KERNEL LN : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + j = (n >> GEMM_UNROLL_N_SHIFT); + + while (j > 0) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = m + offset; + + if (m & (GEMM_UNROLL_M - 1)) { + for (i = 
1; i < GEMM_UNROLL_M; i *= 2){ + if (m & i) { + aa = a + ((m & ~(i - 1)) - i) * k * COMPSIZE; + cc = c + ((m & ~(i - 1)) - i) * COMPSIZE; + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - i) * i * COMPSIZE, + b + (kk - i) * j * COMPSIZE, + cc, ldc); + + kk -= i; + } + } + } + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + aa = a + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * k * COMPSIZE; + cc = c + ((m & ~(GEMM_UNROLL_M - 1)) - GEMM_UNROLL_M) * COMPSIZE; + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - GEMM_UNROLL_M) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_M) * j * COMPSIZE, + cc, ldc); + + aa -= GEMM_UNROLL_M * k * COMPSIZE; + cc -= GEMM_UNROLL_M * COMPSIZE; + kk -= GEMM_UNROLL_M; + i --; + } while (i > 0); + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_LT_power10.c b/kernel/power/trsm_kernel_LT_power10.c new file mode 100644 index 000000000..14ff12fe4 --- /dev/null +++ b/kernel/power/trsm_kernel_LT_power10.c @@ -0,0 +1,1265 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/ +/*********************************************************************/ + +#include "common.h" +#include + +static FLOAT dm1 = -1.; + +#ifdef CONJ +#define GEMM_KERNEL GEMM_KERNEL_L +#else +#define GEMM_KERNEL GEMM_KERNEL_N +#endif + +#if GEMM_DEFAULT_UNROLL_M == 1 +#define GEMM_UNROLL_M_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 2 +#define GEMM_UNROLL_M_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 4 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 6 +#define GEMM_UNROLL_M_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 8 +#define GEMM_UNROLL_M_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_M == 16 +#define GEMM_UNROLL_M_SHIFT 4 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 1 +#define GEMM_UNROLL_N_SHIFT 0 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 2 +#define GEMM_UNROLL_N_SHIFT 1 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 4 +#define GEMM_UNROLL_N_SHIFT 2 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 8 +#define GEMM_UNROLL_N_SHIFT 3 +#endif + +#if GEMM_DEFAULT_UNROLL_N == 16 +#define GEMM_UNROLL_N_SHIFT 4 +#endif + +#ifndef COMPLEX + +#ifdef DOUBLE + +static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[1], 0); + VbS3 = vec_splat(Vb[1], 1); + VbS4 = vec_splat(Vb[2], 0); + VbS5 = vec_splat(Vb[2], 1); + VbS6 = vec_splat(Vb[3], 0); + VbS7 = vec_splat(Vb[3], 1); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= c0[0] * a[1]; + c1[1] -= c1[0] * a[1]; + c2[1] -= c2[0] * a[1]; + c3[1] -= c3[0] * a[1]; + c4[1] -= c4[0] * a[1]; + c5[1] -= c5[0] * a[1]; + c6[1] -= c6[0] * a[1]; + c7[1] -= c7[0] * a[1]; + + 
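    /*
     * The stanza above solves row 0 of the packed 8x8 tile for all eight
     * right-hand-side columns and eliminates it from the rows below; the
     * stanzas that follow repeat the pattern for rows 1..7 (forward
     * substitution).  The tile is stored row-major, so element (i,j) of the
     * triangular factor is a[i*8 + j], and the diagonal entries a[9*i] are
     * assumed to hold reciprocals prepared by the packing routine, which is
     * why each pivot step is a multiply rather than a divide.  A scalar
     * sketch of the same recurrence, for reference only (not compiled here):
     *
     *   for (i = 0; i < 8; i++)
     *     for (j = 0; j < 8; j++) {
     *       x = c[i + j*ldc] * a[i*8 + i];      // pivot (stored as 1/diag)
     *       b[i*8 + j] = c[i + j*ldc] = x;      // repack solved value into b
     *       for (k = i + 1; k < 8; k++)
     *         c[k + j*ldc] -= x * a[i*8 + k];   // eliminate from rows below
     *     }
     */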
b[ 8] = (c0[1] *= a[9]); + b[ 9] = (c1[1] *= a[9]); + b[10] = (c2[1] *= a[9]); + b[11] = (c3[1] *= a[9]); + b[12] = (c4[1] *= a[9]); + b[13] = (c5[1] *= a[9]); + b[14] = (c6[1] *= a[9]); + b[15] = (c7[1] *= a[9]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[6], 0); + VbS5 = vec_splat(Vb[6], 1); + VbS6 = vec_splat(Vb[7], 0); + VbS7 = vec_splat(Vb[7], 1); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + + b[16] = (c0[2] *= a[18]); + b[17] = (c1[2] *= a[18]); + b[18] = (c2[2] *= a[18]); + b[19] = (c3[2] *= a[18]); + b[20] = (c4[2] *= a[18]); + b[21] = (c5[2] *= a[18]); + b[22] = (c6[2] *= a[18]); + b[23] = (c7[2] *= a[18]); + VbS0 = vec_splat(Vb[ 8], 0); + VbS1 = vec_splat(Vb[ 8], 1); + VbS2 = vec_splat(Vb[ 9], 0); + VbS3 = vec_splat(Vb[ 9], 1); + VbS4 = vec_splat(Vb[10], 0); + VbS5 = vec_splat(Vb[10], 1); + VbS6 = vec_splat(Vb[11], 0); + VbS7 = vec_splat(Vb[11], 1); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= c0[2] * a[19]; + c1[3] -= c1[2] * a[19]; + c2[3] -= c2[2] * a[19]; + c3[3] -= c3[2] * a[19]; + c4[3] -= c4[2] * a[19]; + c5[3] -= c5[2] * a[19]; + c6[3] -= c6[2] * a[19]; + c7[3] -= c7[2] * a[19]; + + b[24] = (c0[3] *= a[27]); + b[25] = (c1[3] *= a[27]); + b[26] = (c2[3] *= a[27]); + b[27] = (c3[3] *= a[27]); + b[28] = (c4[3] *= a[27]); + b[29] = (c5[3] *= a[27]); + b[30] = (c6[3] *= a[27]); + b[31] = (c7[3] *= a[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + VbS3 = vec_splat(Vb[13], 1); + VbS4 = vec_splat(Vb[14], 0); + VbS5 = vec_splat(Vb[14], 1); + VbS6 = vec_splat(Vb[15], 0); + VbS7 = vec_splat(Vb[15], 1); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, 
Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[36]); + b[33] = (c1[4] *= a[36]); + b[34] = (c2[4] *= a[36]); + b[35] = (c3[4] *= a[36]); + b[36] = (c4[4] *= a[36]); + b[37] = (c5[4] *= a[36]); + b[38] = (c6[4] *= a[36]); + b[39] = (c7[4] *= a[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + VbS4 = vec_splat(Vb[18], 0); + VbS5 = vec_splat(Vb[18], 1); + VbS6 = vec_splat(Vb[19], 0); + VbS7 = vec_splat(Vb[19], 1); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= c0[4] * a[37]; + c1[5] -= c1[4] * a[37]; + c2[5] -= c2[4] * a[37]; + c3[5] -= c3[4] * a[37]; + c4[5] -= c4[4] * a[37]; + c5[5] -= c5[4] * a[37]; + c6[5] -= c6[4] * a[37]; + c7[5] -= c7[4] * a[37]; + + b[40] = (c0[5] *= a[45]); + b[41] = (c1[5] *= a[45]); + b[42] = (c2[5] *= a[45]); + b[43] = (c3[5] *= a[45]); + b[44] = (c4[5] *= a[45]); + b[45] = (c5[5] *= a[45]); + b[46] = (c6[5] *= a[45]); + b[47] = (c7[5] *= a[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + VbS5 = vec_splat(Vb[22], 1); + VbS6 = vec_splat(Vb[23], 0); + VbS7 = vec_splat(Vb[23], 1); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + + b[48] = (c0[6] *= a[54]); + b[49] = (c1[6] *= a[54]); + b[50] = (c2[6] *= a[54]); + b[51] = (c3[6] *= a[54]); + b[52] = (c4[6] *= a[54]); + b[53] = (c5[6] *= a[54]); + b[54] = (c6[6] *= a[54]); + b[55] = (c7[6] *= a[54]); + c0[7] -= c0[6] * a[55]; + c1[7] -= c1[6] * a[55]; + c2[7] -= c2[6] * a[55]; + c3[7] -= c3[6] * a[55]; + c4[7] -= c4[6] * a[55]; + c5[7] -= c5[6] * a[55]; + c6[7] -= c6[6] * a[55]; + c7[7] -= c7[6] * a[55]; + + b[56] = (c0[7] *= a[63]); + b[57] = (c1[7] *= a[63]); + b[58] = (c2[7] *= a[63]); + b[59] = (c3[7] *= a[63]); + b[60] = (c4[7] *= a[63]); + b[61] = (c5[7] *= a[63]); + b[62] = (c6[7] *= a[63]); + b[63] = (c7[7] *= a[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + 
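    /*
     * As in the double-precision case, Vc0..Vc7 view the C columns as vectors
     * of four floats and the packed tile a is a 16x16 row-major block: element
     * (i,j) is a[i*16 + j] and the diagonal a[17*i] is assumed to be stored
     * pre-inverted.  Here rows are eliminated from 0 up to 15 (forward
     * substitution); column elements that do not fill a whole vector are
     * handled by the interleaved scalar statements, and each solved value is
     * packed into b at b[8*i + column] for the trailing GEMM update.
     */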
vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + int j; + + b[0] = (c0[0] *= a[0]); + b[1] = (c1[0] *= a[0]); + b[2] = (c2[0] *= a[0]); + b[3] = (c3[0] *= a[0]); + b[4] = (c4[0] *= a[0]); + b[5] = (c5[0] *= a[0]); + b[6] = (c6[0] *= a[0]); + b[7] = (c7[0] *= a[0]); + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + Vc0[1] = vec_nmsub(VbS0, Va[1], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[2], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[3], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + c0[1] -= b[0] * a[ 1]; + c0[2] -= b[0] * a[ 2]; + c0[3] -= b[0] * a[ 3]; + c1[1] -= b[1] * a[ 1]; + c1[2] -= b[1] * a[ 2]; + c1[3] -= b[1] * a[ 3]; + c2[1] -= b[2] * a[ 1]; + c2[2] -= b[2] * a[ 2]; + c2[3] -= b[2] * a[ 3]; + c3[1] -= b[3] * a[ 1]; + c3[2] -= b[3] * a[ 2]; + c3[3] -= b[3] * a[ 3]; + c4[1] -= b[4] * a[ 1]; + c4[2] -= b[4] * a[ 2]; + c4[3] -= b[4] * a[ 3]; + c5[1] -= b[5] * a[ 1]; + c5[2] -= b[5] * a[ 2]; + c5[3] -= b[5] * a[ 3]; + c6[1] -= b[6] * a[ 1]; + c6[2] -= b[6] * a[ 2]; + c6[3] -= b[6] * a[ 3]; + c7[1] -= b[7] * a[ 1]; + c7[2] -= b[7] * a[ 2]; + c7[3] -= b[7] * a[ 3]; + + b[ 8] = (c0[1] *= a[17]); + b[ 9] = (c1[1] *= a[17]); + b[10] = (c2[1] *= a[17]); + b[11] = (c3[1] *= a[17]); + b[12] = (c4[1] *= a[17]); + b[13] = (c5[1] *= a[17]); + b[14] = (c6[1] *= a[17]); + b[15] = (c7[1] *= a[17]); + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + VbS2 = vec_splat(Vb[2], 2); + VbS3 = vec_splat(Vb[2], 3); + VbS4 = vec_splat(Vb[3], 0); + VbS5 = vec_splat(Vb[3], 1); + VbS6 = vec_splat(Vb[3], 2); + VbS7 = vec_splat(Vb[3], 3); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[5], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[6], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[7], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[7], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[7], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[7], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[6], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[7], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[7], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[7], Vc7[3]); + c0[2] -= b[ 8] * a[18]; + c0[3] -= b[ 8] * a[19]; + c1[2] -= b[ 9] * a[18]; + c1[3] -= b[ 9] * a[19]; + c2[2] -= b[10] * a[18]; + c2[3] -= b[10] * a[19]; + c3[2] -= b[11] * a[18]; + c3[3] -= b[11] * a[19]; + c4[2] -= b[12] * a[18]; + c4[3] -= b[12] * a[19]; + c5[2] -= b[13] * a[18]; + c5[3] -= b[13] * a[19]; + c6[2] -= b[14] * a[18]; + c6[3] -= b[14] * a[19]; + c7[2] -= b[15] * a[18]; + c7[3] -= b[15] * a[19]; + + b[16] = (c0[2] *= a[34]); + b[17] = (c1[2] *= a[34]); + b[18] = (c2[2] *= a[34]); + b[19] = (c3[2] *= a[34]); + b[20] = (c4[2] *= a[34]); + b[21] = (c5[2] *= a[34]); + b[22] = (c6[2] *= a[34]); + b[23] = (c7[2] *= a[34]); + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + VbS3 = vec_splat(Vb[4], 3); + VbS4 = vec_splat(Vb[5], 0); + VbS5 = vec_splat(Vb[5], 1); + VbS6 = vec_splat(Vb[5], 2); + VbS7 = vec_splat(Vb[5], 3); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[ 9], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[10], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[11], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[11], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[11], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[10], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[11], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[11], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[11], Vc7[3]); + c0[3] -= b[16] * a[35]; + c1[3] -= b[17] * a[35]; + c2[3] -= b[18] * a[35]; + c3[3] -= b[19] * a[35]; + c4[3] -= b[20] * a[35]; + c5[3] -= b[21] * a[35]; + c6[3] -= b[22] * a[35]; + c7[3] -= b[23] * a[35]; + + b[24] = (c0[3] *= a[51]); + b[25] = (c1[3] *= a[51]); + b[26] = (c2[3] *= a[51]); + b[27] = (c3[3] *= a[51]); + b[28] = (c4[3] *= a[51]); + b[29] = (c5[3] *= a[51]); + b[30] = (c6[3] *= a[51]); + b[31] = (c7[3] *= a[51]); + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + VbS6 = vec_splat(Vb[7], 2); + VbS7 = vec_splat(Vb[7], 3); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + Vc3[1] = vec_nmsub(VbS3, Va[13], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[14], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[15], Vc3[3]); + Vc4[1] = vec_nmsub(VbS4, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[14], Vc4[2]); + Vc4[3] = 
vec_nmsub(VbS4, Va[15], Vc4[3]); + Vc5[1] = vec_nmsub(VbS5, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[15], Vc5[3]); + Vc6[1] = vec_nmsub(VbS6, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[15], Vc6[3]); + Vc7[1] = vec_nmsub(VbS7, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[15], Vc7[3]); + + b[32] = (c0[4] *= a[68]); + b[33] = (c1[4] *= a[68]); + b[34] = (c2[4] *= a[68]); + b[35] = (c3[4] *= a[68]); + b[36] = (c4[4] *= a[68]); + b[37] = (c5[4] *= a[68]); + b[38] = (c6[4] *= a[68]); + b[39] = (c7[4] *= a[68]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + VbS5 = vec_splat(Vb[9], 1); + VbS6 = vec_splat(Vb[9], 2); + VbS7 = vec_splat(Vb[9], 3); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[18], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[19], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[19], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[19], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[19], Vc7[3]); + c0[5] -= b[32] * a[69]; + c0[6] -= b[32] * a[70]; + c0[7] -= b[32] * a[71]; + c1[5] -= b[33] * a[69]; + c1[6] -= b[33] * a[70]; + c1[7] -= b[33] * a[71]; + c2[5] -= b[34] * a[69]; + c2[6] -= b[34] * a[70]; + c2[7] -= b[34] * a[71]; + c3[5] -= b[35] * a[69]; + c3[6] -= b[35] * a[70]; + c3[7] -= b[35] * a[71]; + c4[5] -= b[36] * a[69]; + c4[6] -= b[36] * a[70]; + c4[7] -= b[36] * a[71]; + c5[5] -= b[37] * a[69]; + c5[6] -= b[37] * a[70]; + c5[7] -= b[37] * a[71]; + c6[5] -= b[38] * a[69]; + c6[6] -= b[38] * a[70]; + c6[7] -= b[38] * a[71]; + c7[5] -= b[39] * a[69]; + c7[6] -= b[39] * a[70]; + c7[7] -= b[39] * a[71]; + + b[40] = (c0[5] *= a[85]); + b[41] = (c1[5] *= a[85]); + b[42] = (c2[5] *= a[85]); + b[43] = (c3[5] *= a[85]); + b[44] = (c4[5] *= a[85]); + b[45] = (c5[5] *= a[85]); + b[46] = (c6[5] *= a[85]); + b[47] = (c7[5] *= a[85]); + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + VbS6 = vec_splat(Vb[11], 2); + VbS7 = vec_splat(Vb[11], 3); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[22], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[23], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[23], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[23], Vc7[3]); + c0[6] -= b[40] * a[86]; + c0[7] -= b[40] * a[87]; + c1[6] -= b[41] * a[86]; + c1[7] -= b[41] * a[87]; + c2[6] -= b[42] * a[86]; + c2[7] -= b[42] * 
a[87]; + c3[6] -= b[43] * a[86]; + c3[7] -= b[43] * a[87]; + c4[6] -= b[44] * a[86]; + c4[7] -= b[44] * a[87]; + c5[6] -= b[45] * a[86]; + c5[7] -= b[45] * a[87]; + c6[6] -= b[46] * a[86]; + c6[7] -= b[46] * a[87]; + c7[6] -= b[47] * a[86]; + c7[7] -= b[47] * a[87]; + + b[48] = (c0[6] *= a[102]); + b[49] = (c1[6] *= a[102]); + b[50] = (c2[6] *= a[102]); + b[51] = (c3[6] *= a[102]); + b[52] = (c4[6] *= a[102]); + b[53] = (c5[6] *= a[102]); + b[54] = (c6[6] *= a[102]); + b[55] = (c7[6] *= a[102]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + VbS7 = vec_splat(Vb[13], 3); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[26], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[27], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[27], Vc7[3]); + c0[7] -= b[48] * a[103]; + c1[7] -= b[49] * a[103]; + c2[7] -= b[50] * a[103]; + c3[7] -= b[51] * a[103]; + c4[7] -= b[52] * a[103]; + c5[7] -= b[53] * a[103]; + c6[7] -= b[54] * a[103]; + c7[7] -= b[55] * a[103]; + + b[56] = (c0[7] *= a[119]); + b[57] = (c1[7] *= a[119]); + b[58] = (c2[7] *= a[119]); + b[59] = (c3[7] *= a[119]); + b[60] = (c4[7] *= a[119]); + b[61] = (c5[7] *= a[119]); + b[62] = (c6[7] *= a[119]); + b[63] = (c7[7] *= a[119]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + Vc7[2] = vec_nmsub(VbS7, Va[30], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[31], Vc7[3]); + + b[64] = (c0[8] *= a[136]); + b[65] = (c1[8] *= a[136]); + b[66] = (c2[8] *= a[136]); + b[67] = (c3[8] *= a[136]); + b[68] = (c4[8] *= a[136]); + b[69] = (c5[8] *= a[136]); + b[70] = (c6[8] *= a[136]); + b[71] = (c7[8] *= a[136]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[16], 2); + VbS3 = vec_splat(Vb[16], 3); + VbS4 = vec_splat(Vb[17], 0); + VbS5 = vec_splat(Vb[17], 1); + VbS6 = vec_splat(Vb[17], 2); + VbS7 = vec_splat(Vb[17], 3); + Vc0[3] = vec_nmsub(VbS0, Va[35], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[35], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[35], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[35], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[35], 
Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[35], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[35], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[35], Vc7[3]); + c0[ 9] -= b[64] * a[137]; + c0[10] -= b[64] * a[138]; + c0[11] -= b[64] * a[139]; + c1[ 9] -= b[65] * a[137]; + c1[10] -= b[65] * a[138]; + c1[11] -= b[65] * a[139]; + c2[ 9] -= b[66] * a[137]; + c2[10] -= b[66] * a[138]; + c2[11] -= b[66] * a[139]; + c3[ 9] -= b[67] * a[137]; + c3[10] -= b[67] * a[138]; + c3[11] -= b[67] * a[139]; + c4[ 9] -= b[68] * a[137]; + c4[10] -= b[68] * a[138]; + c4[11] -= b[68] * a[139]; + c5[ 9] -= b[69] * a[137]; + c5[10] -= b[69] * a[138]; + c5[11] -= b[69] * a[139]; + c6[ 9] -= b[70] * a[137]; + c6[10] -= b[70] * a[138]; + c6[11] -= b[70] * a[139]; + c7[ 9] -= b[71] * a[137]; + c7[10] -= b[71] * a[138]; + c7[11] -= b[71] * a[139]; + + b[72] = (c0[9] *= a[153]); + b[73] = (c1[9] *= a[153]); + b[74] = (c2[9] *= a[153]); + b[75] = (c3[9] *= a[153]); + b[76] = (c4[9] *= a[153]); + b[77] = (c5[9] *= a[153]); + b[78] = (c6[9] *= a[153]); + b[79] = (c7[9] *= a[153]); + VbS0 = vec_splat(Vb[18], 0); + VbS1 = vec_splat(Vb[18], 1); + VbS2 = vec_splat(Vb[18], 2); + VbS3 = vec_splat(Vb[18], 3); + VbS4 = vec_splat(Vb[19], 0); + VbS5 = vec_splat(Vb[19], 1); + VbS6 = vec_splat(Vb[19], 2); + VbS7 = vec_splat(Vb[19], 3); + Vc0[3] = vec_nmsub(VbS0, Va[39], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[39], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[39], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[39], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[39], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[39], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[39], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[39], Vc7[3]); + c0[10] -= b[72] * a[154]; + c0[11] -= b[72] * a[155]; + c1[10] -= b[73] * a[154]; + c1[11] -= b[73] * a[155]; + c2[10] -= b[74] * a[154]; + c2[11] -= b[74] * a[155]; + c3[10] -= b[75] * a[154]; + c3[11] -= b[75] * a[155]; + c4[10] -= b[76] * a[154]; + c4[11] -= b[76] * a[155]; + c5[10] -= b[77] * a[154]; + c5[11] -= b[77] * a[155]; + c6[10] -= b[78] * a[154]; + c6[11] -= b[78] * a[155]; + c7[10] -= b[79] * a[154]; + c7[11] -= b[79] * a[155]; + + b[80] = (c0[10] *= a[170]); + b[81] = (c1[10] *= a[170]); + b[82] = (c2[10] *= a[170]); + b[83] = (c3[10] *= a[170]); + b[84] = (c4[10] *= a[170]); + b[85] = (c5[10] *= a[170]); + b[86] = (c6[10] *= a[170]); + b[87] = (c7[10] *= a[170]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[20], 2); + VbS3 = vec_splat(Vb[20], 3); + VbS4 = vec_splat(Vb[21], 0); + VbS5 = vec_splat(Vb[21], 1); + VbS6 = vec_splat(Vb[21], 2); + VbS7 = vec_splat(Vb[21], 3); + Vc0[3] = vec_nmsub(VbS0, Va[43], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[43], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[43], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[43], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[43], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[43], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[43], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[43], Vc7[3]); + c0[11] -= b[80] * a[171]; + c1[11] -= b[81] * a[171]; + c2[11] -= b[82] * a[171]; + c3[11] -= b[83] * a[171]; + c4[11] -= b[84] * a[171]; + c5[11] -= b[85] * a[171]; + c6[11] -= b[86] * a[171]; + c7[11] -= b[87] * a[171]; + + b[88] = (c0[11] *= a[187]); + b[89] = (c1[11] *= a[187]); + b[90] = (c2[11] *= a[187]); + b[91] = (c3[11] *= a[187]); + b[92] = (c4[11] *= a[187]); + b[93] = (c5[11] *= a[187]); + b[94] = (c6[11] *= a[187]); + b[95] = (c7[11] *= a[187]); + VbS0 = vec_splat(Vb[22], 0); + VbS1 = vec_splat(Vb[22], 1); + VbS2 = vec_splat(Vb[22], 2); + VbS3 = vec_splat(Vb[22], 3); + VbS4 = vec_splat(Vb[23], 
0); + VbS5 = vec_splat(Vb[23], 1); + VbS6 = vec_splat(Vb[23], 2); + VbS7 = vec_splat(Vb[23], 3); + Vc0[3] = vec_nmsub(VbS0, Va[47], Vc0[3]); + Vc1[3] = vec_nmsub(VbS1, Va[47], Vc1[3]); + Vc2[3] = vec_nmsub(VbS2, Va[47], Vc2[3]); + Vc3[3] = vec_nmsub(VbS3, Va[47], Vc3[3]); + Vc4[3] = vec_nmsub(VbS4, Va[47], Vc4[3]); + Vc5[3] = vec_nmsub(VbS5, Va[47], Vc5[3]); + Vc6[3] = vec_nmsub(VbS6, Va[47], Vc6[3]); + Vc7[3] = vec_nmsub(VbS7, Va[47], Vc7[3]); + + b[ 96] = (c0[12] *= a[204]); + b[ 97] = (c1[12] *= a[204]); + b[ 98] = (c2[12] *= a[204]); + b[ 99] = (c3[12] *= a[204]); + b[100] = (c4[12] *= a[204]); + b[101] = (c5[12] *= a[204]); + b[102] = (c6[12] *= a[204]); + b[103] = (c7[12] *= a[204]); + c0[13] -= b[ 96] * a[205]; + c0[14] -= b[ 96] * a[206]; + c0[15] -= b[ 96] * a[207]; + c1[13] -= b[ 97] * a[205]; + c1[14] -= b[ 97] * a[206]; + c1[15] -= b[ 97] * a[207]; + c2[13] -= b[ 98] * a[205]; + c2[14] -= b[ 98] * a[206]; + c2[15] -= b[ 98] * a[207]; + c3[13] -= b[ 99] * a[205]; + c3[14] -= b[ 99] * a[206]; + c3[15] -= b[ 99] * a[207]; + c4[13] -= b[100] * a[205]; + c4[14] -= b[100] * a[206]; + c4[15] -= b[100] * a[207]; + c5[13] -= b[101] * a[205]; + c5[14] -= b[101] * a[206]; + c5[15] -= b[101] * a[207]; + c6[13] -= b[102] * a[205]; + c6[14] -= b[102] * a[206]; + c6[15] -= b[102] * a[207]; + c7[13] -= b[103] * a[205]; + c7[14] -= b[103] * a[206]; + c7[15] -= b[103] * a[207]; + + b[104] = (c0[13] *= a[221]); + b[105] = (c1[13] *= a[221]); + b[106] = (c2[13] *= a[221]); + b[107] = (c3[13] *= a[221]); + b[108] = (c4[13] *= a[221]); + b[109] = (c5[13] *= a[221]); + b[110] = (c6[13] *= a[221]); + b[111] = (c7[13] *= a[221]); + c0[14] -= b[104] * a[222]; + c0[15] -= b[104] * a[223]; + c1[14] -= b[105] * a[222]; + c1[15] -= b[105] * a[223]; + c2[14] -= b[106] * a[222]; + c2[15] -= b[106] * a[223]; + c3[14] -= b[107] * a[222]; + c3[15] -= b[107] * a[223]; + c4[14] -= b[108] * a[222]; + c4[15] -= b[108] * a[223]; + c5[14] -= b[109] * a[222]; + c5[15] -= b[109] * a[223]; + c6[14] -= b[110] * a[222]; + c6[15] -= b[110] * a[223]; + c7[14] -= b[111] * a[222]; + c7[15] -= b[111] * a[223]; + + b[112] = (c0[14] *= a[238]); + b[113] = (c1[14] *= a[238]); + b[114] = (c2[14] *= a[238]); + b[115] = (c3[14] *= a[238]); + b[116] = (c4[14] *= a[238]); + b[117] = (c5[14] *= a[238]); + b[118] = (c6[14] *= a[238]); + b[119] = (c7[14] *= a[238]); + c0[15] -= b[112] * a[239]; + c1[15] -= b[113] * a[239]; + c2[15] -= b[114] * a[239]; + c3[15] -= b[115] * a[239]; + c4[15] -= b[116] * a[239]; + c5[15] -= b[117] * a[239]; + c6[15] -= b[118] * a[239]; + c7[15] -= b[119] * a[239]; + + b[120] = (c0[15] *= a[255]); + b[121] = (c1[15] *= a[255]); + b[122] = (c2[15] *= a[255]); + b[123] = (c3[15] *= a[255]); + b[124] = (c4[15] *= a[255]); + b[125] = (c5[15] *= a[255]); + b[126] = (c6[15] *= a[255]); + b[127] = (c7[15] *= a[255]); +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < m; i++) { + + aa = *(a + i); + + for (j = 0; j < n; j ++) { + bb = *(c + i + j * ldc); + bb *= aa; + *b = bb; + *(c + i + j * ldc) = bb; + b ++; + + for (k = i + 1; k < m; k ++){ + *(c + k + j * ldc) -= bb * *(a + k); + } + + } + a += m; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < m; i++) { + + aa1 = *(a 
+ i * 2 + 0); + aa2 = *(a + i * 2 + 1); + + for (j = 0; j < n; j ++) { + bb1 = *(c + i * 2 + 0 + j * ldc); + bb2 = *(c + i * 2 + 1 + j * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = aa1 * bb2 - aa2 * bb1; +#endif + + *(b + 0) = cc1; + *(b + 1) = cc2; + *(c + i * 2 + 0 + j * ldc) = cc1; + *(c + i * 2 + 1 + j * ldc) = cc2; + b += 2; + + for (k = i + 1; k < m; k ++){ +#ifndef CONJ + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) - cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#else + *(c + k * 2 + 0 + j * ldc) -= cc1 * *(a + k * 2 + 0) + cc2 * *(a + k * 2 + 1); + *(c + k * 2 + 1 + j * ldc) -= -cc1 * *(a + k * 2 + 1) + cc2 * *(a + k * 2 + 0); +#endif + } + + } + a += m * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM KERNEL LT : m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + + j = (n >> GEMM_UNROLL_N_SHIFT); + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; + j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + kk = offset; + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + kk += GEMM_UNROLL_M; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * 
COMPSIZE; + kk += i; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RN_power10.c b/kernel/power/trsm_kernel_RN_power10.c new file mode 100644 index 000000000..92c26fcc3 --- /dev/null +++ b/kernel/power/trsm_kernel_RN_power10.c @@ -0,0 +1,828 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+ FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+ c0 = &c[0*ldc];
+ c1 = &c[1*ldc];
+ c2 = &c[2*ldc];
+ c3 = &c[3*ldc];
+ c4 = &c[4*ldc];
+ c5 = &c[5*ldc];
+ c6 = &c[6*ldc];
+ c7 = &c[7*ldc];
+ vector FLOAT *Vb = (vector FLOAT *) b;
+ vector FLOAT *Vc0 = (vector FLOAT *) c0;
+ vector FLOAT *Vc1 = (vector FLOAT *) c1;
+ vector FLOAT *Vc2 = (vector FLOAT *) c2;
+ vector FLOAT *Vc3 = (vector FLOAT *) c3;
+ vector FLOAT *Vc4 = (vector FLOAT *) c4;
+ vector FLOAT *Vc5 = (vector FLOAT *) c5;
+ vector FLOAT *Vc6 = (vector FLOAT *) c6;
+ vector FLOAT *Vc7 = (vector FLOAT *) c7;
+ vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+ a[0] = (c0[0] *= b[0]);
+ a[1] = (c0[1] *= b[0]);
+ a[2] = (c0[2] *= b[0]);
+ a[3] = (c0[3] *= b[0]);
+ a[4] = (c0[4] *= b[0]);
+ a[5] = (c0[5] *= b[0]);
+ a[6] = (c0[6] *= b[0]);
+ a[7] = (c0[7] *= b[0]);
+ VbS0 = vec_splat(Vb[0], 1);
+ VbS1 = vec_splat(Vb[1], 0);
+ VbS2 = vec_splat(Vb[1], 1);
+ VbS3 = vec_splat(Vb[2], 0);
+ VbS4 = vec_splat(Vb[2], 1);
+ VbS5 = vec_splat(Vb[3], 0);
+ VbS6 = vec_splat(Vb[3], 1);
+ Vc1[0] = vec_nmsub(Vc0[ 0], VbS0, Vc1[0]);
+ Vc1[1] = vec_nmsub(Vc0[ 1], VbS0, Vc1[1]);
+ Vc1[2] = vec_nmsub(Vc0[ 2], VbS0, Vc1[2]);
+ Vc1[3] = vec_nmsub(Vc0[ 3], VbS0, Vc1[3]);
+ Vc2[0] = vec_nmsub(Vc0[ 0], VbS1, Vc2[0]);
+ Vc2[1] = vec_nmsub(Vc0[ 1], VbS1, Vc2[1]);
+ Vc2[2] = vec_nmsub(Vc0[ 2], VbS1, Vc2[2]);
+ Vc2[3] = vec_nmsub(Vc0[ 3], VbS1, Vc2[3]);
+ Vc3[0] = vec_nmsub(Vc0[ 0], VbS2, Vc3[0]);
+ Vc3[1] = vec_nmsub(Vc0[ 1], VbS2, Vc3[1]);
+ Vc3[2] = vec_nmsub(Vc0[ 2], VbS2, Vc3[2]);
+ Vc3[3] = vec_nmsub(Vc0[ 3], VbS2, Vc3[3]);
+ Vc4[0] = vec_nmsub(Vc0[ 0], VbS3, Vc4[0]);
+ Vc4[1] = vec_nmsub(Vc0[ 1], VbS3, Vc4[1]);
+ Vc4[2] = vec_nmsub(Vc0[ 2], VbS3, Vc4[2]);
+ Vc4[3] = vec_nmsub(Vc0[ 3], VbS3, Vc4[3]);
+ Vc5[0] = vec_nmsub(Vc0[ 0], VbS4, Vc5[0]);
+ Vc5[1] = vec_nmsub(Vc0[ 1], VbS4, Vc5[1]);
+ Vc5[2] = vec_nmsub(Vc0[ 2], VbS4, Vc5[2]);
+ Vc5[3] = vec_nmsub(Vc0[ 3], VbS4, Vc5[3]);
+ Vc6[0] = vec_nmsub(Vc0[ 0], VbS5, Vc6[0]);
+ Vc6[1] = vec_nmsub(Vc0[ 1], VbS5, Vc6[1]);
+ Vc6[2] = vec_nmsub(Vc0[ 2], VbS5, Vc6[2]);
+ Vc6[3] = vec_nmsub(Vc0[ 3], VbS5, Vc6[3]);
+ Vc7[0] = vec_nmsub(Vc0[ 0], VbS6, Vc7[0]);
+ Vc7[1] = vec_nmsub(Vc0[ 1], VbS6, Vc7[1]);
+ Vc7[2] = vec_nmsub(Vc0[ 2], VbS6, Vc7[2]);
+ Vc7[3] = vec_nmsub(Vc0[ 3], VbS6, Vc7[3]);
+
+ a[ 8] = (c1[0] *= b[9]);
+ a[ 9] = (c1[1] *=
b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[5], 0); + VbS1 = vec_splat(Vb[5], 1); + VbS2 = vec_splat(Vb[6], 0); + VbS3 = vec_splat(Vb[6], 1); + VbS4 = vec_splat(Vb[7], 0); + VbS5 = vec_splat(Vb[7], 1); + Vc2[0] = vec_nmsub(Vc1[0], VbS0, Vc2[0]); + Vc2[1] = vec_nmsub(Vc1[1], VbS0, Vc2[1]); + Vc2[2] = vec_nmsub(Vc1[2], VbS0, Vc2[2]); + Vc2[3] = vec_nmsub(Vc1[3], VbS0, Vc2[3]); + Vc3[0] = vec_nmsub(Vc1[0], VbS1, Vc3[0]); + Vc3[1] = vec_nmsub(Vc1[1], VbS1, Vc3[1]); + Vc3[2] = vec_nmsub(Vc1[2], VbS1, Vc3[2]); + Vc3[3] = vec_nmsub(Vc1[3], VbS1, Vc3[3]); + Vc4[0] = vec_nmsub(Vc1[0], VbS2, Vc4[0]); + Vc4[1] = vec_nmsub(Vc1[1], VbS2, Vc4[1]); + Vc4[2] = vec_nmsub(Vc1[2], VbS2, Vc4[2]); + Vc4[3] = vec_nmsub(Vc1[3], VbS2, Vc4[3]); + Vc5[0] = vec_nmsub(Vc1[0], VbS3, Vc5[0]); + Vc5[1] = vec_nmsub(Vc1[1], VbS3, Vc5[1]); + Vc5[2] = vec_nmsub(Vc1[2], VbS3, Vc5[2]); + Vc5[3] = vec_nmsub(Vc1[3], VbS3, Vc5[3]); + Vc6[0] = vec_nmsub(Vc1[0], VbS4, Vc6[0]); + Vc6[1] = vec_nmsub(Vc1[1], VbS4, Vc6[1]); + Vc6[2] = vec_nmsub(Vc1[2], VbS4, Vc6[2]); + Vc6[3] = vec_nmsub(Vc1[3], VbS4, Vc6[3]); + Vc7[0] = vec_nmsub(Vc1[0], VbS5, Vc7[0]); + Vc7[1] = vec_nmsub(Vc1[1], VbS5, Vc7[1]); + Vc7[2] = vec_nmsub(Vc1[2], VbS5, Vc7[2]); + Vc7[3] = vec_nmsub(Vc1[3], VbS5, Vc7[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[ 9], 1); + VbS1 = vec_splat(Vb[10], 0); + VbS2 = vec_splat(Vb[10], 1); + VbS3 = vec_splat(Vb[11], 0); + VbS4 = vec_splat(Vb[11], 1); + Vc3[0] = vec_nmsub(Vc2[0], VbS0, Vc3[0]); + Vc3[1] = vec_nmsub(Vc2[1], VbS0, Vc3[1]); + Vc3[2] = vec_nmsub(Vc2[2], VbS0, Vc3[2]); + Vc3[3] = vec_nmsub(Vc2[3], VbS0, Vc3[3]); + Vc4[0] = vec_nmsub(Vc2[0], VbS1, Vc4[0]); + Vc4[1] = vec_nmsub(Vc2[1], VbS1, Vc4[1]); + Vc4[2] = vec_nmsub(Vc2[2], VbS1, Vc4[2]); + Vc4[3] = vec_nmsub(Vc2[3], VbS1, Vc4[3]); + Vc5[0] = vec_nmsub(Vc2[0], VbS2, Vc5[0]); + Vc5[1] = vec_nmsub(Vc2[1], VbS2, Vc5[1]); + Vc5[2] = vec_nmsub(Vc2[2], VbS2, Vc5[2]); + Vc5[3] = vec_nmsub(Vc2[3], VbS2, Vc5[3]); + Vc6[0] = vec_nmsub(Vc2[0], VbS3, Vc6[0]); + Vc6[1] = vec_nmsub(Vc2[1], VbS3, Vc6[1]); + Vc6[2] = vec_nmsub(Vc2[2], VbS3, Vc6[2]); + Vc6[3] = vec_nmsub(Vc2[3], VbS3, Vc6[3]); + Vc7[0] = vec_nmsub(Vc2[0], VbS4, Vc7[0]); + Vc7[1] = vec_nmsub(Vc2[1], VbS4, Vc7[1]); + Vc7[2] = vec_nmsub(Vc2[2], VbS4, Vc7[2]); + Vc7[3] = vec_nmsub(Vc2[3], VbS4, Vc7[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[15], 0); + VbS3 = vec_splat(Vb[15], 1); + Vc4[0] = vec_nmsub(Vc3[0], VbS0, Vc4[0]); + Vc4[1] = vec_nmsub(Vc3[1], VbS0, Vc4[1]); + Vc4[2] = vec_nmsub(Vc3[2], VbS0, Vc4[2]); + Vc4[3] = vec_nmsub(Vc3[3], VbS0, Vc4[3]); + Vc5[0] = vec_nmsub(Vc3[0], VbS1, Vc5[0]); + Vc5[1] = vec_nmsub(Vc3[1], VbS1, Vc5[1]); + Vc5[2] = vec_nmsub(Vc3[2], VbS1, Vc5[2]); + Vc5[3] = vec_nmsub(Vc3[3], VbS1, Vc5[3]); + Vc6[0] = vec_nmsub(Vc3[0], VbS2, Vc6[0]); + Vc6[1] = vec_nmsub(Vc3[1], VbS2, Vc6[1]); + Vc6[2] = vec_nmsub(Vc3[2], VbS2, Vc6[2]); + Vc6[3] = vec_nmsub(Vc3[3], VbS2, Vc6[3]); + Vc7[0] = 
vec_nmsub(Vc3[0], VbS3, Vc7[0]); + Vc7[1] = vec_nmsub(Vc3[1], VbS3, Vc7[1]); + Vc7[2] = vec_nmsub(Vc3[2], VbS3, Vc7[2]); + Vc7[3] = vec_nmsub(Vc3[3], VbS3, Vc7[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[18], 1); + VbS1 = vec_splat(Vb[19], 0); + VbS2 = vec_splat(Vb[19], 1); + Vc5[0] = vec_nmsub(Vc4[0], VbS0, Vc5[0]); + Vc5[1] = vec_nmsub(Vc4[1], VbS0, Vc5[1]); + Vc5[2] = vec_nmsub(Vc4[2], VbS0, Vc5[2]); + Vc5[3] = vec_nmsub(Vc4[3], VbS0, Vc5[3]); + Vc6[0] = vec_nmsub(Vc4[0], VbS1, Vc6[0]); + Vc6[1] = vec_nmsub(Vc4[1], VbS1, Vc6[1]); + Vc6[2] = vec_nmsub(Vc4[2], VbS1, Vc6[2]); + Vc6[3] = vec_nmsub(Vc4[3], VbS1, Vc6[3]); + Vc7[0] = vec_nmsub(Vc4[0], VbS2, Vc7[0]); + Vc7[1] = vec_nmsub(Vc4[1], VbS2, Vc7[1]); + Vc7[2] = vec_nmsub(Vc4[2], VbS2, Vc7[2]); + Vc7[3] = vec_nmsub(Vc4[3], VbS2, Vc7[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[23], 0); + VbS1 = vec_splat(Vb[23], 1); + Vc6[0] = vec_nmsub(Vc5[0], VbS0, Vc6[0]); + Vc6[1] = vec_nmsub(Vc5[1], VbS0, Vc6[1]); + Vc6[2] = vec_nmsub(Vc5[2], VbS0, Vc6[2]); + Vc6[3] = vec_nmsub(Vc5[3], VbS0, Vc6[3]); + Vc7[0] = vec_nmsub(Vc5[0], VbS1, Vc7[0]); + Vc7[1] = vec_nmsub(Vc5[1], VbS1, Vc7[1]); + Vc7[2] = vec_nmsub(Vc5[2], VbS1, Vc7[2]); + Vc7[3] = vec_nmsub(Vc5[3], VbS1, Vc7[3]); + + a[48] = (c6[0] *= b[54]); + a[49] = (c6[1] *= b[54]); + a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[27], 1); + Vc7[0] = vec_nmsub(Vc6[0], VbS0, Vc7[0]); + Vc7[1] = vec_nmsub(Vc6[1], VbS0, Vc7[1]); + Vc7[2] = vec_nmsub(Vc6[2], VbS0, Vc7[2]); + Vc7[3] = vec_nmsub(Vc6[3], VbS0, Vc7[3]); + + a[56] = (c7[0] *= b[63]); + a[57] = (c7[1] *= b[63]); + a[58] = (c7[2] *= b[63]); + a[59] = (c7[3] *= b[63]); + a[60] = (c7[4] *= b[63]); + a[61] = (c7[5] *= b[63]); + a[62] = (c7[6] *= b[63]); + a[63] = (c7[7] *= b[63]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[0], 0); + VbS1 = vec_splat(Vb[0], 1); + VbS2 = vec_splat(Vb[0], 2); + VbS3 = vec_splat(Vb[0], 3); + VbS4 = vec_splat(Vb[1], 0); + VbS5 = vec_splat(Vb[1], 1); + VbS6 = vec_splat(Vb[1], 2); + VbS7 = vec_splat(Vb[1], 3); + + Vc0[ 0] = vec_mul(VbS0, Vc0[ 0]); + Vc0[ 1] = vec_mul(VbS0, Vc0[ 1]); + Vc0[ 2] = vec_mul(VbS0, Vc0[ 2]); + Vc0[ 3] = vec_mul(VbS0, Vc0[ 3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = 
Vc0[2]; + Va[3] = Vc0[3]; + Vc1[0] = vec_nmsub(VbS1, Va[0], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[1], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[2], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[3], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[0], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[1], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[2], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[3], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[0], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[1], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[2], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[3], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[0], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[1], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[2], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[3], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[0], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[1], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[2], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[3], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[0], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[1], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[2], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[3], Vc6[3]); + Vc7[0] = vec_nmsub(VbS7, Va[0], Vc7[0]); + Vc7[1] = vec_nmsub(VbS7, Va[1], Vc7[1]); + Vc7[2] = vec_nmsub(VbS7, Va[2], Vc7[2]); + Vc7[3] = vec_nmsub(VbS7, Va[3], Vc7[3]); + + VbS0 = vec_splat(Vb[2], 1); + VbS1 = vec_splat(Vb[2], 2); + VbS2 = vec_splat(Vb[2], 3); + VbS3 = vec_splat(Vb[3], 0); + VbS4 = vec_splat(Vb[3], 1); + VbS5 = vec_splat(Vb[3], 2); + VbS6 = vec_splat(Vb[3], 3); + + Vc1[0] = vec_mul(VbS0, Vc1[0]); + Vc1[1] = vec_mul(VbS0, Vc1[1]); + Vc1[2] = vec_mul(VbS0, Vc1[2]); + Vc1[3] = vec_mul(VbS0, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc2[0] = vec_nmsub(VbS1, Va[4], Vc2[0]); + Vc2[1] = vec_nmsub(VbS1, Va[5], Vc2[1]); + Vc2[2] = vec_nmsub(VbS1, Va[6], Vc2[2]); + Vc2[3] = vec_nmsub(VbS1, Va[7], Vc2[3]); + Vc3[0] = vec_nmsub(VbS2, Va[4], Vc3[0]); + Vc3[1] = vec_nmsub(VbS2, Va[5], Vc3[1]); + Vc3[2] = vec_nmsub(VbS2, Va[6], Vc3[2]); + Vc3[3] = vec_nmsub(VbS2, Va[7], Vc3[3]); + Vc4[0] = vec_nmsub(VbS3, Va[4], Vc4[0]); + Vc4[1] = vec_nmsub(VbS3, Va[5], Vc4[1]); + Vc4[2] = vec_nmsub(VbS3, Va[6], Vc4[2]); + Vc4[3] = vec_nmsub(VbS3, Va[7], Vc4[3]); + Vc5[0] = vec_nmsub(VbS4, Va[4], Vc5[0]); + Vc5[1] = vec_nmsub(VbS4, Va[5], Vc5[1]); + Vc5[2] = vec_nmsub(VbS4, Va[6], Vc5[2]); + Vc5[3] = vec_nmsub(VbS4, Va[7], Vc5[3]); + Vc6[0] = vec_nmsub(VbS5, Va[4], Vc6[0]); + Vc6[1] = vec_nmsub(VbS5, Va[5], Vc6[1]); + Vc6[2] = vec_nmsub(VbS5, Va[6], Vc6[2]); + Vc6[3] = vec_nmsub(VbS5, Va[7], Vc6[3]); + Vc7[0] = vec_nmsub(VbS6, Va[4], Vc7[0]); + Vc7[1] = vec_nmsub(VbS6, Va[5], Vc7[1]); + Vc7[2] = vec_nmsub(VbS6, Va[6], Vc7[2]); + Vc7[3] = vec_nmsub(VbS6, Va[7], Vc7[3]); + + VbS0 = vec_splat(Vb[4], 2); + VbS1 = vec_splat(Vb[4], 3); + VbS2 = vec_splat(Vb[5], 0); + VbS3 = vec_splat(Vb[5], 1); + VbS4 = vec_splat(Vb[5], 2); + VbS5 = vec_splat(Vb[5], 3); + + Vc2[0] = vec_mul(VbS0, Vc2[0]); + Vc2[1] = vec_mul(VbS0, Vc2[1]); + Vc2[2] = vec_mul(VbS0, Vc2[2]); + Vc2[3] = vec_mul(VbS0, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc3[0] = vec_nmsub(VbS1, Va[ 8], Vc3[0]); + Vc3[1] = vec_nmsub(VbS1, Va[ 9], Vc3[1]); + Vc3[2] = vec_nmsub(VbS1, Va[10], Vc3[2]); + Vc3[3] = vec_nmsub(VbS1, Va[11], Vc3[3]); + Vc4[0] = vec_nmsub(VbS2, Va[ 8], Vc4[0]); + Vc4[1] = vec_nmsub(VbS2, Va[ 9], Vc4[1]); + Vc4[2] = vec_nmsub(VbS2, Va[10], Vc4[2]); + Vc4[3] = vec_nmsub(VbS2, Va[11], Vc4[3]); + Vc5[0] = vec_nmsub(VbS3, Va[ 8], Vc5[0]); + Vc5[1] = vec_nmsub(VbS3, Va[ 9], Vc5[1]); + Vc5[2] = vec_nmsub(VbS3, Va[10], 
Vc5[2]); + Vc5[3] = vec_nmsub(VbS3, Va[11], Vc5[3]); + Vc6[0] = vec_nmsub(VbS4, Va[ 8], Vc6[0]); + Vc6[1] = vec_nmsub(VbS4, Va[ 9], Vc6[1]); + Vc6[2] = vec_nmsub(VbS4, Va[10], Vc6[2]); + Vc6[3] = vec_nmsub(VbS4, Va[11], Vc6[3]); + Vc7[0] = vec_nmsub(VbS5, Va[ 8], Vc7[0]); + Vc7[1] = vec_nmsub(VbS5, Va[ 9], Vc7[1]); + Vc7[2] = vec_nmsub(VbS5, Va[10], Vc7[2]); + Vc7[3] = vec_nmsub(VbS5, Va[11], Vc7[3]); + + VbS0 = vec_splat(Vb[6], 3); + VbS1 = vec_splat(Vb[7], 0); + VbS2 = vec_splat(Vb[7], 1); + VbS3 = vec_splat(Vb[7], 2); + VbS4 = vec_splat(Vb[7], 3); + + Vc3[0] = vec_mul(VbS0, Vc3[0]); + Vc3[1] = vec_mul(VbS0, Vc3[1]); + Vc3[2] = vec_mul(VbS0, Vc3[2]); + Vc3[3] = vec_mul(VbS0, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc4[0] = vec_nmsub(VbS1, Va[12], Vc4[0]); + Vc4[1] = vec_nmsub(VbS1, Va[13], Vc4[1]); + Vc4[2] = vec_nmsub(VbS1, Va[14], Vc4[2]); + Vc4[3] = vec_nmsub(VbS1, Va[15], Vc4[3]); + Vc5[0] = vec_nmsub(VbS2, Va[12], Vc5[0]); + Vc5[1] = vec_nmsub(VbS2, Va[13], Vc5[1]); + Vc5[2] = vec_nmsub(VbS2, Va[14], Vc5[2]); + Vc5[3] = vec_nmsub(VbS2, Va[15], Vc5[3]); + Vc6[0] = vec_nmsub(VbS3, Va[12], Vc6[0]); + Vc6[1] = vec_nmsub(VbS3, Va[13], Vc6[1]); + Vc6[2] = vec_nmsub(VbS3, Va[14], Vc6[2]); + Vc6[3] = vec_nmsub(VbS3, Va[15], Vc6[3]); + Vc7[0] = vec_nmsub(VbS4, Va[12], Vc7[0]); + Vc7[1] = vec_nmsub(VbS4, Va[13], Vc7[1]); + Vc7[2] = vec_nmsub(VbS4, Va[14], Vc7[2]); + Vc7[3] = vec_nmsub(VbS4, Va[15], Vc7[3]); + + VbS0 = vec_splat(Vb[9], 0); + VbS1 = vec_splat(Vb[9], 1); + VbS2 = vec_splat(Vb[9], 2); + VbS3 = vec_splat(Vb[9], 3); + + Vc4[0] = vec_mul(VbS0, Vc4[0]); + Vc4[1] = vec_mul(VbS0, Vc4[1]); + Vc4[2] = vec_mul(VbS0, Vc4[2]); + Vc4[3] = vec_mul(VbS0, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc5[0] = vec_nmsub(VbS1, Va[16], Vc5[0]); + Vc5[1] = vec_nmsub(VbS1, Va[17], Vc5[1]); + Vc5[2] = vec_nmsub(VbS1, Va[18], Vc5[2]); + Vc5[3] = vec_nmsub(VbS1, Va[19], Vc5[3]); + Vc6[0] = vec_nmsub(VbS2, Va[16], Vc6[0]); + Vc6[1] = vec_nmsub(VbS2, Va[17], Vc6[1]); + Vc6[2] = vec_nmsub(VbS2, Va[18], Vc6[2]); + Vc6[3] = vec_nmsub(VbS2, Va[19], Vc6[3]); + Vc7[0] = vec_nmsub(VbS3, Va[16], Vc7[0]); + Vc7[1] = vec_nmsub(VbS3, Va[17], Vc7[1]); + Vc7[2] = vec_nmsub(VbS3, Va[18], Vc7[2]); + Vc7[3] = vec_nmsub(VbS3, Va[19], Vc7[3]); + + VbS0 = vec_splat(Vb[11], 1); + VbS1 = vec_splat(Vb[11], 2); + VbS2 = vec_splat(Vb[11], 3); + + Vc5[0] = vec_mul(VbS0, Vc5[0]); + Vc5[1] = vec_mul(VbS0, Vc5[1]); + Vc5[2] = vec_mul(VbS0, Vc5[2]); + Vc5[3] = vec_mul(VbS0, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc6[0] = vec_nmsub(VbS1, Va[20], Vc6[0]); + Vc6[1] = vec_nmsub(VbS1, Va[21], Vc6[1]); + Vc6[2] = vec_nmsub(VbS1, Va[22], Vc6[2]); + Vc6[3] = vec_nmsub(VbS1, Va[23], Vc6[3]); + Vc7[0] = vec_nmsub(VbS2, Va[20], Vc7[0]); + Vc7[1] = vec_nmsub(VbS2, Va[21], Vc7[1]); + Vc7[2] = vec_nmsub(VbS2, Va[22], Vc7[2]); + Vc7[3] = vec_nmsub(VbS2, Va[23], Vc7[3]); + + VbS0 = vec_splat(Vb[13], 2); + VbS1 = vec_splat(Vb[13], 3); + + Vc6[0] = vec_mul(VbS0, Vc6[0]); + Vc6[1] = vec_mul(VbS0, Vc6[1]); + Vc6[2] = vec_mul(VbS0, Vc6[2]); + Vc6[3] = vec_mul(VbS0, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc7[0] = vec_nmsub(VbS1, Va[24], Vc7[0]); + Vc7[1] = vec_nmsub(VbS1, Va[25], Vc7[1]); + Vc7[2] = vec_nmsub(VbS1, Va[26], Vc7[2]); + Vc7[3] = vec_nmsub(VbS1, Va[27], Vc7[3]); + + VbS0 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS0, Vc7[0]); + Vc7[1] = vec_mul(VbS0, 
Vc7[1]); + Vc7[2] = vec_mul(VbS0, Vc7[2]); + Vc7[3] = vec_mul(VbS0, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = Vc7[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + for (i = 0; i < n; i++) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = i + 1; k < n; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b += n; + } +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + for (i = 0; i < n; i++) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = -aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = i + 1; k < n; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= - cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b += n * 2; + } +} + +#endif + + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + FLOAT *aa, *cc; + BLASLONG kk; + BLASLONG i, j, jj; + +#if 0 + fprintf(stderr, "TRSM RN KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + + jj = 0; + j = (n >> GEMM_UNROLL_N_SHIFT); + kk = -offset; + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + while (j > 0) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + if (i > 0) { + do { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, b, cc, ldc); + } + solve(i, GEMM_UNROLL_N, + aa + kk * i * COMPSIZE, + b + kk * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + kk += GEMM_UNROLL_N; + b += GEMM_UNROLL_N * k * COMPSIZE; + c += GEMM_UNROLL_N * ldc * COMPSIZE; 
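+ /* advance to the next GEMM_UNROLL_N column panel; kk was increased above so the next panel's GEMM_KERNEL update subtracts the contribution of the columns already solved */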
+ j --; + jj += GEMM_UNROLL_M; + } + + if (n & (GEMM_UNROLL_N - 1)) { + + j = (GEMM_UNROLL_N >> 1); + while (j > 0) { + if (n & j) { + + aa = a; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + + while (i > 0) { + if (kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + kk * GEMM_UNROLL_M * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + while (i > 0) { + if (m & i) { + if (kk > 0) { + GEMM_KERNEL(i, j, kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa, + b, + cc, + ldc); + } + + solve(i, j, + aa + kk * i * COMPSIZE, + b + kk * j * COMPSIZE, cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } + } + + b += j * k * COMPSIZE; + c += j * ldc * COMPSIZE; + kk += j; + } + j >>= 1; + } + } + + return 0; +} diff --git a/kernel/power/trsm_kernel_RT_power10.c b/kernel/power/trsm_kernel_RT_power10.c new file mode 100644 index 000000000..529590f37 --- /dev/null +++ b/kernel/power/trsm_kernel_RT_power10.c @@ -0,0 +1,855 @@ +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. 
*/
+/*********************************************************************/
+
+#include "common.h"
+#include <altivec.h>
+
+static FLOAT dm1 = -1.;
+
+#ifdef CONJ
+#define GEMM_KERNEL GEMM_KERNEL_R
+#else
+#define GEMM_KERNEL GEMM_KERNEL_N
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 1
+#define GEMM_UNROLL_M_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 2
+#define GEMM_UNROLL_M_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 4
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 6
+#define GEMM_UNROLL_M_SHIFT 2
+#endif
+
+
+#if GEMM_DEFAULT_UNROLL_M == 8
+#define GEMM_UNROLL_M_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_M == 16
+#define GEMM_UNROLL_M_SHIFT 4
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 1
+#define GEMM_UNROLL_N_SHIFT 0
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 2
+#define GEMM_UNROLL_N_SHIFT 1
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 4
+#define GEMM_UNROLL_N_SHIFT 2
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 8
+#define GEMM_UNROLL_N_SHIFT 3
+#endif
+
+#if GEMM_DEFAULT_UNROLL_N == 16
+#define GEMM_UNROLL_N_SHIFT 4
+#endif
+
+#ifndef COMPLEX
+
+#ifdef DOUBLE
+
+static inline __attribute__ ((always_inline)) void solve8x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) {
+ FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7;
+ c0 = &c[0*ldc];
+ c1 = &c[1*ldc];
+ c2 = &c[2*ldc];
+ c3 = &c[3*ldc];
+ c4 = &c[4*ldc];
+ c5 = &c[5*ldc];
+ c6 = &c[6*ldc];
+ c7 = &c[7*ldc];
+ vector FLOAT *Vb = (vector FLOAT *) b;
+ vector FLOAT *Vc0 = (vector FLOAT *) c0;
+ vector FLOAT *Vc1 = (vector FLOAT *) c1;
+ vector FLOAT *Vc2 = (vector FLOAT *) c2;
+ vector FLOAT *Vc3 = (vector FLOAT *) c3;
+ vector FLOAT *Vc4 = (vector FLOAT *) c4;
+ vector FLOAT *Vc5 = (vector FLOAT *) c5;
+ vector FLOAT *Vc6 = (vector FLOAT *) c6;
+ vector FLOAT *Vc7 = (vector FLOAT *) c7;
+ vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6;
+
+ a[56] = (c7[0] *= b[63]);
+ a[57] = (c7[1] *= b[63]);
+ a[58] = (c7[2] *= b[63]);
+ a[59] = (c7[3] *= b[63]);
+ a[60] = (c7[4] *= b[63]);
+ a[61] = (c7[5] *= b[63]);
+ a[62] = (c7[6] *= b[63]);
+ a[63] = (c7[7] *= b[63]);
+ VbS0 = vec_splat(Vb[28], 0);
+ VbS1 = vec_splat(Vb[28], 1);
+ VbS2 = vec_splat(Vb[29], 0);
+ VbS3 = vec_splat(Vb[29], 1);
+ VbS4 = vec_splat(Vb[30], 0);
+ VbS5 = vec_splat(Vb[30], 1);
+ VbS6 = vec_splat(Vb[31], 0);
+ Vc0[0] = vec_nmsub(Vc7[0], VbS0, Vc0[0]);
+ Vc0[1] = vec_nmsub(Vc7[1], VbS0, Vc0[1]);
+ Vc0[2] = vec_nmsub(Vc7[2], VbS0, Vc0[2]);
+ Vc0[3] = vec_nmsub(Vc7[3], VbS0, Vc0[3]);
+ Vc1[0] = vec_nmsub(Vc7[0], VbS1, Vc1[0]);
+ Vc1[1] = vec_nmsub(Vc7[1], VbS1, Vc1[1]);
+ Vc1[2] = vec_nmsub(Vc7[2], VbS1, Vc1[2]);
+ Vc1[3] = vec_nmsub(Vc7[3], VbS1, Vc1[3]);
+ Vc2[0] = vec_nmsub(Vc7[0], VbS2, Vc2[0]);
+ Vc2[1] = vec_nmsub(Vc7[1], VbS2, Vc2[1]);
+ Vc2[2] = vec_nmsub(Vc7[2], VbS2, Vc2[2]);
+ Vc2[3] = vec_nmsub(Vc7[3], VbS2, Vc2[3]);
+ Vc3[0] = vec_nmsub(Vc7[0], VbS3, Vc3[0]);
+ Vc3[1] = vec_nmsub(Vc7[1], VbS3, Vc3[1]);
+ Vc3[2] = vec_nmsub(Vc7[2], VbS3, Vc3[2]);
+ Vc3[3] = vec_nmsub(Vc7[3], VbS3, Vc3[3]);
+ Vc4[0] = vec_nmsub(Vc7[0], VbS4, Vc4[0]);
+ Vc4[1] = vec_nmsub(Vc7[1], VbS4, Vc4[1]);
+ Vc4[2] = vec_nmsub(Vc7[2], VbS4, Vc4[2]);
+ Vc4[3] = vec_nmsub(Vc7[3], VbS4, Vc4[3]);
+ Vc5[0] = vec_nmsub(Vc7[0], VbS5, Vc5[0]);
+ Vc5[1] = vec_nmsub(Vc7[1], VbS5, Vc5[1]);
+ Vc5[2] = vec_nmsub(Vc7[2], VbS5, Vc5[2]);
+ Vc5[3] = vec_nmsub(Vc7[3], VbS5, Vc5[3]);
+ Vc6[0] = vec_nmsub(Vc7[0], VbS6, Vc6[0]);
+ Vc6[1] = vec_nmsub(Vc7[1], VbS6, Vc6[1]);
+ Vc6[2] = vec_nmsub(Vc7[2], VbS6, Vc6[2]);
+ Vc6[3] = vec_nmsub(Vc7[3], VbS6, Vc6[3]);
+
+ a[48] = (c6[0] *= b[54]);
+ a[49] = (c6[1] *= b[54]);
+ a[50] = (c6[2] *= b[54]); + a[51] = (c6[3] *= b[54]); + a[52] = (c6[4] *= b[54]); + a[53] = (c6[5] *= b[54]); + a[54] = (c6[6] *= b[54]); + a[55] = (c6[7] *= b[54]); + VbS0 = vec_splat(Vb[24], 0); + VbS1 = vec_splat(Vb[24], 1); + VbS2 = vec_splat(Vb[25], 0); + VbS3 = vec_splat(Vb[25], 1); + VbS4 = vec_splat(Vb[26], 0); + VbS5 = vec_splat(Vb[26], 1); + Vc0[0] = vec_nmsub(Vc6[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc6[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc6[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc6[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc6[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc6[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc6[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc6[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc6[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc6[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc6[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc6[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc6[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc6[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc6[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc6[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc6[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc6[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc6[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc6[3], VbS4, Vc4[3]); + Vc5[0] = vec_nmsub(Vc6[0], VbS5, Vc5[0]); + Vc5[1] = vec_nmsub(Vc6[1], VbS5, Vc5[1]); + Vc5[2] = vec_nmsub(Vc6[2], VbS5, Vc5[2]); + Vc5[3] = vec_nmsub(Vc6[3], VbS5, Vc5[3]); + + a[40] = (c5[0] *= b[45]); + a[41] = (c5[1] *= b[45]); + a[42] = (c5[2] *= b[45]); + a[43] = (c5[3] *= b[45]); + a[44] = (c5[4] *= b[45]); + a[45] = (c5[5] *= b[45]); + a[46] = (c5[6] *= b[45]); + a[47] = (c5[7] *= b[45]); + VbS0 = vec_splat(Vb[20], 0); + VbS1 = vec_splat(Vb[20], 1); + VbS2 = vec_splat(Vb[21], 0); + VbS3 = vec_splat(Vb[21], 1); + VbS4 = vec_splat(Vb[22], 0); + Vc0[0] = vec_nmsub(Vc5[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc5[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc5[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc5[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc5[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc5[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc5[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc5[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc5[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc5[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc5[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc5[3], VbS2, Vc2[3]); + Vc3[0] = vec_nmsub(Vc5[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc5[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc5[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc5[3], VbS3, Vc3[3]); + Vc4[0] = vec_nmsub(Vc5[0], VbS4, Vc4[0]); + Vc4[1] = vec_nmsub(Vc5[1], VbS4, Vc4[1]); + Vc4[2] = vec_nmsub(Vc5[2], VbS4, Vc4[2]); + Vc4[3] = vec_nmsub(Vc5[3], VbS4, Vc4[3]); + + a[32] = (c4[0] *= b[36]); + a[33] = (c4[1] *= b[36]); + a[34] = (c4[2] *= b[36]); + a[35] = (c4[3] *= b[36]); + a[36] = (c4[4] *= b[36]); + a[37] = (c4[5] *= b[36]); + a[38] = (c4[6] *= b[36]); + a[39] = (c4[7] *= b[36]); + VbS0 = vec_splat(Vb[16], 0); + VbS1 = vec_splat(Vb[16], 1); + VbS2 = vec_splat(Vb[17], 0); + VbS3 = vec_splat(Vb[17], 1); + Vc0[0] = vec_nmsub(Vc4[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc4[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc4[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc4[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc4[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc4[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc4[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc4[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc4[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc4[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc4[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc4[3], VbS2, Vc2[3]); + 
Vc3[0] = vec_nmsub(Vc4[0], VbS3, Vc3[0]); + Vc3[1] = vec_nmsub(Vc4[1], VbS3, Vc3[1]); + Vc3[2] = vec_nmsub(Vc4[2], VbS3, Vc3[2]); + Vc3[3] = vec_nmsub(Vc4[3], VbS3, Vc3[3]); + + a[24] = (c3[0] *= b[27]); + a[25] = (c3[1] *= b[27]); + a[26] = (c3[2] *= b[27]); + a[27] = (c3[3] *= b[27]); + a[28] = (c3[4] *= b[27]); + a[29] = (c3[5] *= b[27]); + a[30] = (c3[6] *= b[27]); + a[31] = (c3[7] *= b[27]); + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[13], 0); + Vc0[0] = vec_nmsub(Vc3[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc3[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc3[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc3[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc3[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc3[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc3[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc3[3], VbS1, Vc1[3]); + Vc2[0] = vec_nmsub(Vc3[0], VbS2, Vc2[0]); + Vc2[1] = vec_nmsub(Vc3[1], VbS2, Vc2[1]); + Vc2[2] = vec_nmsub(Vc3[2], VbS2, Vc2[2]); + Vc2[3] = vec_nmsub(Vc3[3], VbS2, Vc2[3]); + + a[16] = (c2[0] *= b[18]); + a[17] = (c2[1] *= b[18]); + a[18] = (c2[2] *= b[18]); + a[19] = (c2[3] *= b[18]); + a[20] = (c2[4] *= b[18]); + a[21] = (c2[5] *= b[18]); + a[22] = (c2[6] *= b[18]); + a[23] = (c2[7] *= b[18]); + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + Vc0[0] = vec_nmsub(Vc2[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc2[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc2[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc2[3], VbS0, Vc0[3]); + Vc1[0] = vec_nmsub(Vc2[0], VbS1, Vc1[0]); + Vc1[1] = vec_nmsub(Vc2[1], VbS1, Vc1[1]); + Vc1[2] = vec_nmsub(Vc2[2], VbS1, Vc1[2]); + Vc1[3] = vec_nmsub(Vc2[3], VbS1, Vc1[3]); + + a[ 8] = (c1[0] *= b[9]); + a[ 9] = (c1[1] *= b[9]); + a[10] = (c1[2] *= b[9]); + a[11] = (c1[3] *= b[9]); + a[12] = (c1[4] *= b[9]); + a[13] = (c1[5] *= b[9]); + a[14] = (c1[6] *= b[9]); + a[15] = (c1[7] *= b[9]); + VbS0 = vec_splat(Vb[4], 0); + Vc0[0] = vec_nmsub(Vc1[0], VbS0, Vc0[0]); + Vc0[1] = vec_nmsub(Vc1[1], VbS0, Vc0[1]); + Vc0[2] = vec_nmsub(Vc1[2], VbS0, Vc0[2]); + Vc0[3] = vec_nmsub(Vc1[3], VbS0, Vc0[3]); + + a[0] = (c0[0] *= b[0]); + a[1] = (c0[1] *= b[0]); + a[2] = (c0[2] *= b[0]); + a[3] = (c0[3] *= b[0]); + a[4] = (c0[4] *= b[0]); + a[5] = (c0[5] *= b[0]); + a[6] = (c0[6] *= b[0]); + a[7] = (c0[7] *= b[0]); +} + +#else + +static inline __attribute__ ((always_inline)) void solve16x8(FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + FLOAT *c0, *c1, *c2, *c3, *c4, *c5, *c6, *c7; + c0 = &c[0*ldc]; + c1 = &c[1*ldc]; + c2 = &c[2*ldc]; + c3 = &c[3*ldc]; + c4 = &c[4*ldc]; + c5 = &c[5*ldc]; + c6 = &c[6*ldc]; + c7 = &c[7*ldc]; + + vector FLOAT *Va = (vector FLOAT *) a; + vector FLOAT *Vb = (vector FLOAT *) b; + vector FLOAT *Vc0 = (vector FLOAT *) c0; + vector FLOAT *Vc1 = (vector FLOAT *) c1; + vector FLOAT *Vc2 = (vector FLOAT *) c2; + vector FLOAT *Vc3 = (vector FLOAT *) c3; + vector FLOAT *Vc4 = (vector FLOAT *) c4; + vector FLOAT *Vc5 = (vector FLOAT *) c5; + vector FLOAT *Vc6 = (vector FLOAT *) c6; + vector FLOAT *Vc7 = (vector FLOAT *) c7; + vector FLOAT VbS0, VbS1, VbS2, VbS3, VbS4, VbS5, VbS6, VbS7; + + VbS0 = vec_splat(Vb[14], 0); + VbS1 = vec_splat(Vb[14], 1); + VbS2 = vec_splat(Vb[14], 2); + VbS3 = vec_splat(Vb[14], 3); + VbS4 = vec_splat(Vb[15], 0); + VbS5 = vec_splat(Vb[15], 1); + VbS6 = vec_splat(Vb[15], 2); + VbS7 = vec_splat(Vb[15], 3); + + Vc7[0] = vec_mul(VbS7, Vc7[0]); + Vc7[1] = vec_mul(VbS7, Vc7[1]); + Vc7[2] = vec_mul(VbS7, Vc7[2]); + Vc7[3] = vec_mul(VbS7, Vc7[3]); + Va[28] = Vc7[0]; + Va[29] = Vc7[1]; + Va[30] = Vc7[2]; + Va[31] = 
Vc7[3]; + Vc0[0] = vec_nmsub(VbS0, Va[28], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[29], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[30], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[31], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[28], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[29], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[30], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[31], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[28], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[29], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[30], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[31], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[28], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[29], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[30], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[31], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[28], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[29], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[30], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[31], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[28], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[29], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[30], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[31], Vc5[3]); + Vc6[0] = vec_nmsub(VbS6, Va[28], Vc6[0]); + Vc6[1] = vec_nmsub(VbS6, Va[29], Vc6[1]); + Vc6[2] = vec_nmsub(VbS6, Va[30], Vc6[2]); + Vc6[3] = vec_nmsub(VbS6, Va[31], Vc6[3]); + + VbS0 = vec_splat(Vb[12], 0); + VbS1 = vec_splat(Vb[12], 1); + VbS2 = vec_splat(Vb[12], 2); + VbS3 = vec_splat(Vb[12], 3); + VbS4 = vec_splat(Vb[13], 0); + VbS5 = vec_splat(Vb[13], 1); + VbS6 = vec_splat(Vb[13], 2); + + Vc6[0] = vec_mul(VbS6, Vc6[0]); + Vc6[1] = vec_mul(VbS6, Vc6[1]); + Vc6[2] = vec_mul(VbS6, Vc6[2]); + Vc6[3] = vec_mul(VbS6, Vc6[3]); + Va[24] = Vc6[0]; + Va[25] = Vc6[1]; + Va[26] = Vc6[2]; + Va[27] = Vc6[3]; + Vc0[0] = vec_nmsub(VbS0, Va[24], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[25], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[26], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[27], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[24], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[25], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[26], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[27], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[24], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[25], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[26], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[27], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[24], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[25], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[26], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[27], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[24], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[25], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[26], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[27], Vc4[3]); + Vc5[0] = vec_nmsub(VbS5, Va[24], Vc5[0]); + Vc5[1] = vec_nmsub(VbS5, Va[25], Vc5[1]); + Vc5[2] = vec_nmsub(VbS5, Va[26], Vc5[2]); + Vc5[3] = vec_nmsub(VbS5, Va[27], Vc5[3]); + + VbS0 = vec_splat(Vb[10], 0); + VbS1 = vec_splat(Vb[10], 1); + VbS2 = vec_splat(Vb[10], 2); + VbS3 = vec_splat(Vb[10], 3); + VbS4 = vec_splat(Vb[11], 0); + VbS5 = vec_splat(Vb[11], 1); + + Vc5[0] = vec_mul(VbS5, Vc5[0]); + Vc5[1] = vec_mul(VbS5, Vc5[1]); + Vc5[2] = vec_mul(VbS5, Vc5[2]); + Vc5[3] = vec_mul(VbS5, Vc5[3]); + Va[20] = Vc5[0]; + Va[21] = Vc5[1]; + Va[22] = Vc5[2]; + Va[23] = Vc5[3]; + Vc0[0] = vec_nmsub(VbS0, Va[20], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[21], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[22], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[23], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[20], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[21], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[22], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[23], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[20], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, 
Va[21], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[22], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[23], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[20], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[21], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[22], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[23], Vc3[3]); + Vc4[0] = vec_nmsub(VbS4, Va[20], Vc4[0]); + Vc4[1] = vec_nmsub(VbS4, Va[21], Vc4[1]); + Vc4[2] = vec_nmsub(VbS4, Va[22], Vc4[2]); + Vc4[3] = vec_nmsub(VbS4, Va[23], Vc4[3]); + + VbS0 = vec_splat(Vb[8], 0); + VbS1 = vec_splat(Vb[8], 1); + VbS2 = vec_splat(Vb[8], 2); + VbS3 = vec_splat(Vb[8], 3); + VbS4 = vec_splat(Vb[9], 0); + + Vc4[0] = vec_mul(VbS4, Vc4[0]); + Vc4[1] = vec_mul(VbS4, Vc4[1]); + Vc4[2] = vec_mul(VbS4, Vc4[2]); + Vc4[3] = vec_mul(VbS4, Vc4[3]); + Va[16] = Vc4[0]; + Va[17] = Vc4[1]; + Va[18] = Vc4[2]; + Va[19] = Vc4[3]; + Vc0[0] = vec_nmsub(VbS0, Va[16], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[17], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[18], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[19], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[16], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[17], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[18], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[19], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[16], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[17], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[18], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[19], Vc2[3]); + Vc3[0] = vec_nmsub(VbS3, Va[16], Vc3[0]); + Vc3[1] = vec_nmsub(VbS3, Va[17], Vc3[1]); + Vc3[2] = vec_nmsub(VbS3, Va[18], Vc3[2]); + Vc3[3] = vec_nmsub(VbS3, Va[19], Vc3[3]); + + VbS0 = vec_splat(Vb[6], 0); + VbS1 = vec_splat(Vb[6], 1); + VbS2 = vec_splat(Vb[6], 2); + VbS3 = vec_splat(Vb[6], 3); + + Vc3[0] = vec_mul(VbS3, Vc3[0]); + Vc3[1] = vec_mul(VbS3, Vc3[1]); + Vc3[2] = vec_mul(VbS3, Vc3[2]); + Vc3[3] = vec_mul(VbS3, Vc3[3]); + Va[12] = Vc3[0]; + Va[13] = Vc3[1]; + Va[14] = Vc3[2]; + Va[15] = Vc3[3]; + Vc0[0] = vec_nmsub(VbS0, Va[12], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[13], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[14], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[15], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[12], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[13], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[14], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[15], Vc1[3]); + Vc2[0] = vec_nmsub(VbS2, Va[12], Vc2[0]); + Vc2[1] = vec_nmsub(VbS2, Va[13], Vc2[1]); + Vc2[2] = vec_nmsub(VbS2, Va[14], Vc2[2]); + Vc2[3] = vec_nmsub(VbS2, Va[15], Vc2[3]); + + VbS0 = vec_splat(Vb[4], 0); + VbS1 = vec_splat(Vb[4], 1); + VbS2 = vec_splat(Vb[4], 2); + + Vc2[0] = vec_mul(VbS2, Vc2[0]); + Vc2[1] = vec_mul(VbS2, Vc2[1]); + Vc2[2] = vec_mul(VbS2, Vc2[2]); + Vc2[3] = vec_mul(VbS2, Vc2[3]); + Va[ 8] = Vc2[0]; + Va[ 9] = Vc2[1]; + Va[10] = Vc2[2]; + Va[11] = Vc2[3]; + Vc0[0] = vec_nmsub(VbS0, Va[ 8], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[ 9], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[10], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[11], Vc0[3]); + Vc1[0] = vec_nmsub(VbS1, Va[ 8], Vc1[0]); + Vc1[1] = vec_nmsub(VbS1, Va[ 9], Vc1[1]); + Vc1[2] = vec_nmsub(VbS1, Va[10], Vc1[2]); + Vc1[3] = vec_nmsub(VbS1, Va[11], Vc1[3]); + + VbS0 = vec_splat(Vb[2], 0); + VbS1 = vec_splat(Vb[2], 1); + + Vc1[0] = vec_mul(VbS1, Vc1[0]); + Vc1[1] = vec_mul(VbS1, Vc1[1]); + Vc1[2] = vec_mul(VbS1, Vc1[2]); + Vc1[3] = vec_mul(VbS1, Vc1[3]); + Va[4] = Vc1[0]; + Va[5] = Vc1[1]; + Va[6] = Vc1[2]; + Va[7] = Vc1[3]; + Vc0[0] = vec_nmsub(VbS0, Va[4], Vc0[0]); + Vc0[1] = vec_nmsub(VbS0, Va[5], Vc0[1]); + Vc0[2] = vec_nmsub(VbS0, Va[6], Vc0[2]); + Vc0[3] = vec_nmsub(VbS0, Va[7], Vc0[3]); + + VbS0 = vec_splat(Vb[0], 0); + + Vc0[0] = vec_mul(VbS0, 
Vc0[0]); + Vc0[1] = vec_mul(VbS0, Vc0[1]); + Vc0[2] = vec_mul(VbS0, Vc0[2]); + Vc0[3] = vec_mul(VbS0, Vc0[3]); + Va[0] = Vc0[0]; + Va[1] = Vc0[1]; + Va[2] = Vc0[2]; + Va[3] = Vc0[3]; +} + +#endif + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa, bb; + + int i, j, k; + + a += (n - 1) * m; + b += (n - 1) * n; + + for (i = n - 1; i >= 0; i--) { + + bb = *(b + i); + + for (j = 0; j < m; j ++) { + aa = *(c + j + i * ldc); + aa *= bb; + *a = aa; + *(c + j + i * ldc) = aa; + a ++; + + for (k = 0; k < i; k ++){ + *(c + j + k * ldc) -= aa * *(b + k); + } + + } + b -= n; + a -= 2 * m; + } + +} + +#else + +static inline __attribute__ ((always_inline)) void solve(BLASLONG m, BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc) { + + FLOAT aa1, aa2; + FLOAT bb1, bb2; + FLOAT cc1, cc2; + + int i, j, k; + + ldc *= 2; + + a += (n - 1) * m * 2; + b += (n - 1) * n * 2; + + for (i = n - 1; i >= 0; i--) { + + bb1 = *(b + i * 2 + 0); + bb2 = *(b + i * 2 + 1); + + for (j = 0; j < m; j ++) { + + aa1 = *(c + j * 2 + 0 + i * ldc); + aa2 = *(c + j * 2 + 1 + i * ldc); + +#ifndef CONJ + cc1 = aa1 * bb1 - aa2 * bb2; + cc2 = aa1 * bb2 + aa2 * bb1; +#else + cc1 = aa1 * bb1 + aa2 * bb2; + cc2 = - aa1 * bb2 + aa2 * bb1; +#endif + + *(a + 0) = cc1; + *(a + 1) = cc2; + + *(c + j * 2 + 0 + i * ldc) = cc1; + *(c + j * 2 + 1 + i * ldc) = cc2; + a += 2; + + for (k = 0; k < i; k ++){ +#ifndef CONJ + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) - cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#else + *(c + j * 2 + 0 + k * ldc) -= cc1 * *(b + k * 2 + 0) + cc2 * *(b + k * 2 + 1); + *(c + j * 2 + 1 + k * ldc) -= -cc1 * *(b + k * 2 + 1) + cc2 * *(b + k * 2 + 0); +#endif + } + + } + b -= n * 2; + a -= 4 * m; + } + +} + +#endif + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, +#ifdef COMPLEX + FLOAT dummy2, +#endif + FLOAT *a, FLOAT *b, FLOAT *c, BLASLONG ldc, BLASLONG offset){ + + BLASLONG i, j; + FLOAT *aa, *cc; + BLASLONG kk; + +#if 0 + fprintf(stderr, "TRSM RT KERNEL m = %3ld n = %3ld k = %3ld offset = %3ld\n", + m, n, k, offset); +#endif + +#ifdef DOUBLE + int well_aligned = (GEMM_UNROLL_M==8) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#else + int well_aligned = (GEMM_UNROLL_M==16) && (GEMM_UNROLL_N==8) && ((((unsigned long) a) & 0x7) == 0); +#endif + + kk = n - offset; + c += n * ldc * COMPSIZE; + b += n * k * COMPSIZE; + + if (n & (GEMM_UNROLL_N - 1)) { + + j = 1; + while (j < GEMM_UNROLL_N) { + if (n & j) { + + aa = a; + b -= j * k * COMPSIZE; + c -= j * ldc* COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, + ldc); + } + + solve(GEMM_UNROLL_M, j, + aa + (kk - j) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + + if (k - kk > 0) { + GEMM_KERNEL(i, j, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + j * kk * COMPSIZE, + cc, ldc); + } + + solve(i, j, + aa + (kk - j) * i * COMPSIZE, + b + (kk - j) * j * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + + } + i >>= 1; + } while (i > 
0); + } + kk -= j; + } + j <<= 1; + } + } + + j = (n >> GEMM_UNROLL_N_SHIFT); + + if (j > 0) { + + do { + aa = a; + b -= GEMM_UNROLL_N * k * COMPSIZE; + c -= GEMM_UNROLL_N * ldc * COMPSIZE; + cc = c; + + i = (m >> GEMM_UNROLL_M_SHIFT); + if (i > 0) { + do { + if (k - kk > 0) { + GEMM_KERNEL(GEMM_UNROLL_M, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + GEMM_UNROLL_M * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + if (well_aligned) { +#ifdef DOUBLE + solve8x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#else + solve16x8(aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, cc, ldc); +#endif + } + else { + solve(GEMM_UNROLL_M, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_M * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + } + + aa += GEMM_UNROLL_M * k * COMPSIZE; + cc += GEMM_UNROLL_M * COMPSIZE; + i --; + } while (i > 0); + } + + if (m & (GEMM_UNROLL_M - 1)) { + i = (GEMM_UNROLL_M >> 1); + do { + if (m & i) { + if (k - kk > 0) { + GEMM_KERNEL(i, GEMM_UNROLL_N, k - kk, dm1, +#ifdef COMPLEX + ZERO, +#endif + aa + i * kk * COMPSIZE, + b + GEMM_UNROLL_N * kk * COMPSIZE, + cc, + ldc); + } + + solve(i, GEMM_UNROLL_N, + aa + (kk - GEMM_UNROLL_N) * i * COMPSIZE, + b + (kk - GEMM_UNROLL_N) * GEMM_UNROLL_N * COMPSIZE, + cc, ldc); + + aa += i * k * COMPSIZE; + cc += i * COMPSIZE; + } + i >>= 1; + } while (i > 0); + } + + kk -= GEMM_UNROLL_N; + j --; + } while (j > 0); + } + + return 0; +} + + diff --git a/kernel/setparam-ref.c b/kernel/setparam-ref.c index d0317a745..1e846a61c 100644 --- a/kernel/setparam-ref.c +++ b/kernel/setparam-ref.c @@ -933,6 +933,77 @@ static void init_parameter(void) { } #else // (ARCH_ARM64) +#if defined(ARCH_MIPS64) +static void init_parameter(void) { + TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P; + TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P; + TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P; + TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P; + + TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q; + TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q; + TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q; + TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q; + + TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R; + TABLE_NAME.dgemm_r = 640; + TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R; + TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R; + +#ifdef EXPRECISION + TABLE_NAME.qgemm_p = QGEMM_DEFAULT_P; + TABLE_NAME.xgemm_p = XGEMM_DEFAULT_P; + TABLE_NAME.qgemm_q = QGEMM_DEFAULT_Q; + TABLE_NAME.xgemm_q = XGEMM_DEFAULT_Q; + TABLE_NAME.qgemm_r = QGEMM_DEFAULT_R; + TABLE_NAME.xgemm_r = XGEMM_DEFAULT_R; +#endif + +#if defined(USE_GEMM3M) +#ifdef CGEMM3M_DEFAULT_P + TABLE_NAME.cgemm3m_p = CGEMM3M_DEFAULT_P; +#else + TABLE_NAME.cgemm3m_p = TABLE_NAME.sgemm_p; +#endif + +#ifdef ZGEMM3M_DEFAULT_P + TABLE_NAME.zgemm3m_p = ZGEMM3M_DEFAULT_P; +#else + TABLE_NAME.zgemm3m_p = TABLE_NAME.dgemm_p; +#endif + +#ifdef CGEMM3M_DEFAULT_Q + TABLE_NAME.cgemm3m_q = CGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.cgemm3m_q = TABLE_NAME.sgemm_q; +#endif + +#ifdef ZGEMM3M_DEFAULT_Q + TABLE_NAME.zgemm3m_q = ZGEMM3M_DEFAULT_Q; +#else + TABLE_NAME.zgemm3m_q = TABLE_NAME.dgemm_q; +#endif + +#ifdef CGEMM3M_DEFAULT_R + TABLE_NAME.cgemm3m_r = CGEMM3M_DEFAULT_R; +#else + TABLE_NAME.cgemm3m_r = TABLE_NAME.sgemm_r; +#endif + +#ifdef ZGEMM3M_DEFAULT_R + TABLE_NAME.zgemm3m_r = ZGEMM3M_DEFAULT_R; +#else + TABLE_NAME.zgemm3m_r = TABLE_NAME.dgemm_r; +#endif + +#ifdef EXPRECISION + TABLE_NAME.xgemm3m_p = TABLE_NAME.qgemm_p; + 
TABLE_NAME.xgemm3m_q = TABLE_NAME.qgemm_q; + TABLE_NAME.xgemm3m_r = TABLE_NAME.qgemm_r; +#endif +#endif +} +#else // (ARCH_MIPS64) #if (ARCH_POWER) static void init_parameter(void) { @@ -1780,4 +1851,5 @@ static void init_parameter(void) { } #endif //POWER #endif //ZARCH +#endif //(ARCH_MIPS64) #endif //(ARCH_ARM64) diff --git a/param.h b/param.h index a0d45c573..6c5e0f107 100644 --- a/param.h +++ b/param.h @@ -644,9 +644,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -1552,9 +1553,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define CGEMM_DEFAULT_UNROLL_N 2 #define ZGEMM_DEFAULT_UNROLL_N 2 #define XGEMM_DEFAULT_UNROLL_N 1 - +/* #define SGEMM_DEFAULT_UNROLL_MN 32 #define DGEMM_DEFAULT_UNROLL_MN 32 +*/ #endif #ifdef ARCH_X86 @@ -2570,8 +2572,63 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif -#ifdef LOONGSON3A -/*Copy from SICORTEX*/ +#if defined(LOONGSON3R4) +#define SNUMOPT 2 +#define DNUMOPT 2 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#ifdef HAVE_MSA +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 8 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 8 +#define CGEMM_DEFAULT_UNROLL_N 4 + +#define ZGEMM_DEFAULT_UNROLL_M 4 +#define ZGEMM_DEFAULT_UNROLL_N 4 +#else +#define SGEMM_DEFAULT_UNROLL_M 8 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 4 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 4 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 +#endif + +#define SGEMM_DEFAULT_P 64 +#define DGEMM_DEFAULT_P 44 +#define CGEMM_DEFAULT_P 64 +#define ZGEMM_DEFAULT_P 32 + +#define SGEMM_DEFAULT_Q 192 +#define DGEMM_DEFAULT_Q 92 +#define CGEMM_DEFAULT_Q 128 +#define ZGEMM_DEFAULT_Q 80 + +#define SGEMM_DEFAULT_R 640 +#define DGEMM_DEFAULT_R dgemm_r +#define CGEMM_DEFAULT_R 640 +#define ZGEMM_DEFAULT_R 640 + +#define GEMM_OFFSET_A1 0x10000 +#define GEMM_OFFSET_B1 0x100000 + +#define SYMV_P 16 +#endif + +#if defined(LOONGSON3R3) +////Copy from SICORTEX #define SNUMOPT 2 #define DNUMOPT 2 @@ -2612,47 +2669,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define SYMV_P 16 #endif -#ifdef LOONGSON3B -#define SNUMOPT 2 -#define DNUMOPT 2 - -#define GEMM_DEFAULT_OFFSET_A 0 -#define GEMM_DEFAULT_OFFSET_B 0 -#define GEMM_DEFAULT_ALIGN 0x03fffUL - -#define SGEMM_DEFAULT_UNROLL_M 2 -#define SGEMM_DEFAULT_UNROLL_N 2 - -#define DGEMM_DEFAULT_UNROLL_M 2 -#define DGEMM_DEFAULT_UNROLL_N 2 - -#define CGEMM_DEFAULT_UNROLL_M 2 -#define CGEMM_DEFAULT_UNROLL_N 2 - -#define ZGEMM_DEFAULT_UNROLL_M 2 -#define ZGEMM_DEFAULT_UNROLL_N 2 - -#define SGEMM_DEFAULT_P 64 -#define DGEMM_DEFAULT_P 24 -#define CGEMM_DEFAULT_P 24 -#define ZGEMM_DEFAULT_P 20 - -#define SGEMM_DEFAULT_Q 192 -#define DGEMM_DEFAULT_Q 128 -#define CGEMM_DEFAULT_Q 128 -#define ZGEMM_DEFAULT_Q 64 - -#define SGEMM_DEFAULT_R 512 -#define DGEMM_DEFAULT_R 512 -#define CGEMM_DEFAULT_R 512 -#define ZGEMM_DEFAULT_R 512 - -#define GEMM_OFFSET_A1 0x10000 -#define GEMM_OFFSET_B1 0x100000 - -#define SYMV_P 16 -#endif - #if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500) #define SNUMOPT 2 #define DNUMOPT 2
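
Note on the generic solve() added in the RT kernel above: it walks raw pointers, which hides the underlying recurrence. The sketch below restates the same loops with explicit indexing for the real-valued case. The column-major layout of the packed n x n triangular block and the pre-inverted diagonal (which is why the kernel multiplies by bb rather than dividing) are assumptions inferred from the pointer arithmetic, not something the patch states; double is used for concreteness.

#include <stddef.h>

/* Index-based restatement of the real-valued solve() above (illustration only).
 * c is the m x n block solved in place (leading dimension ldc),
 * b is the packed n x n triangular factor, a receives the repacked panel. */
void solve_rt_ref(int m, int n, double *a, double *b, double *c, int ldc)
{
    for (int i = n - 1; i >= 0; i--) {
        double bb = b[i * n + i];                 /* assumed pre-inverted diagonal */
        for (int j = 0; j < m; j++) {
            double aa = c[j + i * ldc] * bb;      /* scale column i */
            a[(size_t)i * m + j] = aa;            /* repack the solved panel */
            c[j + i * ldc] = aa;
            for (int k = 0; k < i; k++)           /* update the earlier columns */
                c[j + k * ldc] -= aa * b[i * n + k];
        }
    }
}

The vectorized solve8x8/solve16x8 paths selected by the well_aligned flag in CNAME() perform the same recurrence with the loop over j replaced by vector operations, so a routine like the one above can serve as a scalar reference when checking a new vector path.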
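
The CONJ branch of the complex solve() flips two signs relative to the plain product; those are exactly the real and imaginary parts of a*conj(b). A quick standalone check of the two identities used there (illustrative only, C99 complex doubles):

#include <complex.h>
#include <stdio.h>

int main(void)
{
    double a1 = 1.5, a2 = 2.0, b1 = -0.5, b2 = 3.0;

    /* non-CONJ path: cc = a * b */
    double cc1 = a1 * b1 - a2 * b2;
    double cc2 = a1 * b2 + a2 * b1;

    /* CONJ path: cc = a * conj(b) */
    double dd1 =  a1 * b1 + a2 * b2;
    double dd2 = -a1 * b2 + a2 * b1;

    double complex a = a1 + a2 * I, b = b1 + b2 * I;
    printf("a*b       : kernel %g,%g  libm %g,%g\n",
           cc1, cc2, creal(a * b), cimag(a * b));
    printf("a*conj(b) : kernel %g,%g  libm %g,%g\n",
           dd1, dd2, creal(a * conj(b)), cimag(a * conj(b)));
    return 0;
}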
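
The new LOONGSON3R4 blocking factors in param.h set the packed panel sizes used by the level-3 kernels. The snippet below just multiplies out the P*Q products from that block to give a rough feel for the per-panel working set; reading P and Q as the packed-A panel dimensions follows the usual OpenBLAS convention and is an assumption here, not something the patch states.

#include <stdio.h>

int main(void)
{
    /* P, Q and element sizes taken from the LOONGSON3R4 defines above. */
    struct { const char *name; int p, q, elem; } cfg[] = {
        { "SGEMM", 64, 192,  4 },
        { "DGEMM", 44,  92,  8 },
        { "CGEMM", 64, 128,  8 },   /* single complex = 2 * 4 bytes */
        { "ZGEMM", 32,  80, 16 },   /* double complex = 2 * 8 bytes */
    };

    for (int i = 0; i < 4; i++) {
        int bytes = cfg[i].p * cfg[i].q * cfg[i].elem;
        printf("%s: P*Q panel = %d bytes (~%d KiB)\n",
               cfg[i].name, bytes, bytes / 1024);
    }
    return 0;
}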