Merge branch 'OpenMathLib:develop' into scalfixes
This commit is contained in:
commit
f5d04318e3
|
@ -28,6 +28,9 @@ jobs:
|
||||||
- target: RISCV64_ZVL256B
|
- target: RISCV64_ZVL256B
|
||||||
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
|
||||||
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||||
|
- target: DYNAMIC_ARCH=1
|
||||||
|
opts: TARGET=RISCV64_GENERIC BINARY=64 ARCH=riscv64 DYNAMIC_ARCH=1
|
||||||
|
qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
|
||||||
|
|
||||||
steps:
|
steps:
|
||||||
- name: Checkout repository
|
- name: Checkout repository
|
||||||
|
|
|
@ -715,6 +715,17 @@ ifeq ($(ARCH), loongarch64)
|
||||||
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
|
DYNAMIC_CORE = LOONGSON3R5 LOONGSON2K1000 LOONGSONGENERIC
|
||||||
endif
|
endif
|
||||||
|
|
||||||
|
ifeq ($(ARCH), riscv64)
|
||||||
|
DYNAMIC_CORE = RISCV64_GENERIC
|
||||||
|
DYNAMIC_CORE += RISCV64_ZVL128B
|
||||||
|
DYNAMIC_CORE += RISCV64_ZVL256B
|
||||||
|
ifdef DYNAMIC_LIST
|
||||||
|
override DYNAMIC_CORE = RISCV64_GENERIC $(DYNAMIC_LIST)
|
||||||
|
XCCOMMON_OPT = -DDYNAMIC_LIST -DDYN_RISCV64_GENERIC
|
||||||
|
XCCOMMON_OPT += $(foreach dcore,$(DYNAMIC_LIST),-DDYN_$(dcore))
|
||||||
|
endif
|
||||||
|
endif
|
||||||
|
|
||||||
ifeq ($(ARCH), zarch)
|
ifeq ($(ARCH), zarch)
|
||||||
DYNAMIC_CORE = ZARCH_GENERIC
|
DYNAMIC_CORE = ZARCH_GENERIC
|
||||||
|
|
||||||
|
|
|
@ -234,6 +234,8 @@ For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additi
|
||||||
|
|
||||||
on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.
|
on **ZARCH** it comprises Z13 and Z14 as well as generic zarch support.
|
||||||
|
|
||||||
|
On **riscv64**, DYNAMIC_ARCH enables support for riscv64_zvl128b and riscv64_zvl256b in addition to generic riscv64 support. A compiler that supports RVV 1.0 is required to build OpenBLAS for riscv64 when DYNAMIC_ARCH is enabled.
|
||||||
|
|
||||||
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
|
The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
|
||||||
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
|
common code in the library, usually you will want to set this to the oldest model you expect to encounter.
|
||||||
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
Please note that it is not possible to combine support for different architectures, so no combined 32 and 64 bit or x86_64 and arm64 in the same library.
|
||||||
|
|
|
@ -30,12 +30,16 @@ else
|
||||||
ifeq ($(ARCH),loongarch64)
|
ifeq ($(ARCH),loongarch64)
|
||||||
COMMONOBJS += dynamic_loongarch64.$(SUFFIX)
|
COMMONOBJS += dynamic_loongarch64.$(SUFFIX)
|
||||||
else
|
else
|
||||||
|
ifeq ($(ARCH),riscv64)
|
||||||
|
COMMONOBJS += dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX)
|
||||||
|
else
|
||||||
COMMONOBJS += dynamic.$(SUFFIX)
|
COMMONOBJS += dynamic.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
COMMONOBJS += parameter.$(SUFFIX)
|
COMMONOBJS += parameter.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
@ -106,12 +110,16 @@ else
|
||||||
ifeq ($(ARCH),loongarch64)
|
ifeq ($(ARCH),loongarch64)
|
||||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX)
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_loongarch64.$(SUFFIX)
|
||||||
else
|
else
|
||||||
|
ifeq ($(ARCH),riscv64)
|
||||||
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic_riscv64.$(SUFFIX) detect_riscv64.$(SUFFIX)
|
||||||
|
else
|
||||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) dynamic.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
endif
|
endif
|
||||||
|
endif
|
||||||
else
|
else
|
||||||
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
HPLOBJS = memory.$(SUFFIX) xerbla.$(SUFFIX) parameter.$(SUFFIX)
|
||||||
endif
|
endif
|
||||||
|
@ -209,6 +217,9 @@ addx.$(SUFFIX) : $(ARCH)/addx.c
|
||||||
mulx.$(SUFFIX) : $(ARCH)/mulx.c
|
mulx.$(SUFFIX) : $(ARCH)/mulx.c
|
||||||
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F)
|
$(CC) $(CFLAGS) -c -DXDOUBLE -UCOMPLEX $< -o $(@F)
|
||||||
|
|
||||||
|
detect_riscv64.$(SUFFIX): detect_riscv64.c
|
||||||
|
$(CC) $(CFLAGS) -c -march=rv64imafdcv $< -o $(@F)
|
||||||
|
|
||||||
xerbla.$(PSUFFIX) : xerbla.c
|
xerbla.$(PSUFFIX) : xerbla.c
|
||||||
$(CC) $(PFLAGS) -c $< -o $(@F)
|
$(CC) $(PFLAGS) -c $< -o $(@F)
|
||||||
|
|
||||||
|
|
|
@ -0,0 +1,75 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdint.h>
|
||||||
|
|
||||||
|
#ifdef __riscv_v_intrinsic
|
||||||
|
#include <riscv_vector.h>
|
||||||
|
#endif
|
||||||
|
|
||||||
|
unsigned detect_riscv64_get_vlenb(void) {
|
||||||
|
#ifdef __riscv_v_intrinsic
|
||||||
|
return __riscv_vlenb();
|
||||||
|
#else
|
||||||
|
return 0;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Based on the approach taken here:
|
||||||
|
* https://code.videolan.org/videolan/dav1d/-/merge_requests/1629
|
||||||
|
*
|
||||||
|
* Only to be called after we've determined we have some sort of
|
||||||
|
* RVV support.
|
||||||
|
*/
|
||||||
|
|
||||||
|
uint64_t detect_riscv64_rvv100(void)
|
||||||
|
{
|
||||||
|
uint64_t rvv10_supported;
|
||||||
|
|
||||||
|
/*
|
||||||
|
* After the vsetvli statement vtype will either be a value > 0 if the
|
||||||
|
* vsetvli succeeded or less than 0 if it failed. If 0 < vtype
|
||||||
|
* we're good and the function will return 1, otherwise there's no
|
||||||
|
* RVV 1.0 and we return 0.
|
||||||
|
*/
|
||||||
|
|
||||||
|
asm volatile("vsetvli x0, x0, e8, m1, ta, ma\n\t"
|
||||||
|
"csrr %0, vtype\n\t"
|
||||||
|
"slt %0, x0, %0\n"
|
||||||
|
: "=r" (rvv10_supported)
|
||||||
|
:
|
||||||
|
:);
|
||||||
|
|
||||||
|
return rvv10_supported;
|
||||||
|
}
|
||||||
|
|
|
@ -0,0 +1,269 @@
|
||||||
|
/*****************************************************************************
|
||||||
|
Copyright (c) 2024, The OpenBLAS Project
|
||||||
|
All rights reserved.
|
||||||
|
|
||||||
|
Redistribution and use in source and binary forms, with or without
|
||||||
|
modification, are permitted provided that the following conditions are
|
||||||
|
met:
|
||||||
|
|
||||||
|
1. Redistributions of source code must retain the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer.
|
||||||
|
|
||||||
|
2. Redistributions in binary form must reproduce the above copyright
|
||||||
|
notice, this list of conditions and the following disclaimer in
|
||||||
|
the documentation and/or other materials provided with the
|
||||||
|
distribution.
|
||||||
|
3. Neither the name of the OpenBLAS project nor the names of
|
||||||
|
its contributors may be used to endorse or promote products
|
||||||
|
derived from this software without specific prior written
|
||||||
|
permission.
|
||||||
|
|
||||||
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||||
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||||
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||||
|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
|
||||||
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||||
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||||
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||||
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||||
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||||
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**********************************************************************************/
|
||||||
|
|
||||||
|
#include <stdbool.h>
|
||||||
|
|
||||||
|
#include "common.h"
|
||||||
|
|
||||||
|
/*
|
||||||
|
* OpenBLAS contains some kernels that are optimised for RVV 1.0. Before we
|
||||||
|
* can use these kernels we need to determine whether the device supports
|
||||||
|
* RVV 1.0 and what the device's VLEN is. Our strategy will be as follows.
|
||||||
|
*
|
||||||
|
* First we'll invoke the hwprobe syscall to detect RVV 1.0. In an ideal world,
|
||||||
|
* this is all we should need to do. If the syscall is not implemented we
|
||||||
|
* should be able to deduce that RVV 1.0 is not supported (as it was added to
|
||||||
|
* Linux after hwprobe) and if the syscall is implemented we can use it to
|
||||||
|
* determine whether RVV 1.0 is supported. However, there are some riscv64
|
||||||
|
* boards out there that implement RVV 1.0 but ship with a Linux kernel that
|
||||||
|
* predates RVV vector support and hwprobe support. These kernels contain
|
||||||
|
* the backported RVV patches but not the hwprobe patches and so they
|
||||||
|
* advertise support for RVV via hwcap. To cater for these boards we need
|
||||||
|
* to fall back to hwcap if hwprobe is not supported. Unfortunately, some
|
||||||
|
* boards indicate support for RVV via hwcap even though they only support
|
||||||
|
* RVV 0.7.1, which is incompatible with RVV 1.0. So an additional check is
|
||||||
|
* required to test if the devices advertising support for RVV via hwcap really
|
||||||
|
* support RVV 1.0. This test works by executing a vsetvli instruction that
|
||||||
|
* sets the tail agnostic and mask agnostic bits in the vtype register.
|
||||||
|
* These bits are not supported prior to RVV 0.9 so will cause the VIL bit to
|
||||||
|
* be set on the VTYPE register in CPUs supporting 0.7.1. If this bit is set
|
||||||
|
* we can determine that RVV 1.0 is not supported.
|
||||||
|
*
|
||||||
|
* This approach is borrowed from
|
||||||
|
* VideoLan dav1d:
|
||||||
|
* (https://code.videolan.org/videolan/dav1d/-/merge_requests/1629).
|
||||||
|
*
|
||||||
|
* We assume that if a kernel reports the presence of RVV via hwcap that
|
||||||
|
* the device supports the vsetvli instruction.
|
||||||
|
*
|
||||||
|
* For now we're just going to invoke the hwprobe syscall directly, rather than
|
||||||
|
* invoking it through glibc. Support for hwprobe has been added to glibc but
|
||||||
|
* at the time of writing this support has not yet been included in a glibc
|
||||||
|
* release. Once it has, it will be better to invoke hwprobe via glibc as doing
|
||||||
|
* so should take advantage of the vdso entry and be more efficient.
|
||||||
|
*/
|
||||||
|
|
||||||
|
/*
|
||||||
|
* This should work on Android as well but I have no way of testing.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#if defined(OS_LINUX)
|
||||||
|
#include <unistd.h>
|
||||||
|
#include <sys/syscall.h>
|
||||||
|
#include <stdint.h>
|
||||||
|
#include <sys/auxv.h>
|
||||||
|
|
||||||
|
#define DETECT_RISCV64_HWCAP_ISA_V (1 << ('V' - 'A'))
|
||||||
|
|
||||||
|
struct riscv_hwprobe {
|
||||||
|
int64_t key;
|
||||||
|
uint64_t value;
|
||||||
|
};
|
||||||
|
|
||||||
|
/* The constants below are copied from
|
||||||
|
* /usr/include/riscv64-linux-gnu/asm/hwprobe.h. We duplicate the
|
||||||
|
* constants as the header file from which they are copied will only
|
||||||
|
* be present if we're building on a device with Linux 6.5 or greater.
|
||||||
|
*/
|
||||||
|
|
||||||
|
#define RISCV_HWPROBE_KEY_IMA_EXT_0 4
|
||||||
|
#define RISCV_HWPROBE_IMA_V (1 << 2)
|
||||||
|
|
||||||
|
#ifndef NR_riscv_hwprobe
|
||||||
|
#ifndef NR_arch_specific_syscall
|
||||||
|
#define NR_arch_specific_syscall 244
|
||||||
|
#endif
|
||||||
|
#define NR_riscv_hwprobe (NR_arch_specific_syscall + 14)
|
||||||
|
#endif
|
||||||
|
#endif // defined(OS_LINUX)
|
||||||
|
|
||||||
|
unsigned detect_riscv64_get_vlenb(void);
|
||||||
|
uint64_t detect_riscv64_rvv100(void);
|
||||||
|
|
||||||
|
extern gotoblas_t gotoblas_RISCV64_GENERIC;
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
|
||||||
|
extern gotoblas_t gotoblas_RISCV64_ZVL256B;
|
||||||
|
#endif
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
|
||||||
|
extern gotoblas_t gotoblas_RISCV64_ZVL128B;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#define CPU_GENERIC 0
|
||||||
|
#define CPU_RISCV64_ZVL256B 1
|
||||||
|
#define CPU_RISCV64_ZVL128B 2
|
||||||
|
|
||||||
|
static char *cpuname[] = {
|
||||||
|
"riscv64_generic",
|
||||||
|
"riscv64_zvl256b",
|
||||||
|
"riscv64_zvl128b"
|
||||||
|
};
|
||||||
|
#define NUM_CORETYPES (sizeof(cpuname)/sizeof(char*))
|
||||||
|
|
||||||
|
extern int openblas_verbose(void);
|
||||||
|
extern void openblas_warning(int verbose, const char* msg);
|
||||||
|
|
||||||
|
char* gotoblas_corename(void) {
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
|
||||||
|
if (gotoblas == &gotoblas_RISCV64_ZVL256B)
|
||||||
|
return cpuname[CPU_RISCV64_ZVL256B];
|
||||||
|
#endif
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
|
||||||
|
if (gotoblas == &gotoblas_RISCV64_ZVL128B)
|
||||||
|
return cpuname[CPU_RISCV64_ZVL128B];
|
||||||
|
#endif
|
||||||
|
if (gotoblas == &gotoblas_RISCV64_GENERIC)
|
||||||
|
return cpuname[CPU_GENERIC];
|
||||||
|
|
||||||
|
return "unknown";
|
||||||
|
}
|
||||||
|
|
||||||
|
static gotoblas_t* get_coretype(void) {
|
||||||
|
unsigned vlenb = 0;
|
||||||
|
|
||||||
|
#if !defined(OS_LINUX)
|
||||||
|
return NULL;
|
||||||
|
#else
|
||||||
|
|
||||||
|
/*
|
||||||
|
* See the hwprobe documentation
|
||||||
|
*
|
||||||
|
* ( https://docs.kernel.org/arch/riscv/hwprobe.html )
|
||||||
|
* for more details.
|
||||||
|
*/
|
||||||
|
|
||||||
|
struct riscv_hwprobe pairs[] = {
|
||||||
|
{ .key = RISCV_HWPROBE_KEY_IMA_EXT_0, },
|
||||||
|
};
|
||||||
|
int ret = syscall(NR_riscv_hwprobe, pairs, 1, 0, NULL, 0);
|
||||||
|
if (ret == 0) {
|
||||||
|
if (!(pairs[0].value & RISCV_HWPROBE_IMA_V))
|
||||||
|
return NULL;
|
||||||
|
} else {
|
||||||
|
if (!(getauxval(AT_HWCAP) & DETECT_RISCV64_HWCAP_ISA_V))
|
||||||
|
return NULL;
|
||||||
|
|
||||||
|
if (!detect_riscv64_rvv100())
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* RVV 1.0 is supported. We now just need to determine the coretype
|
||||||
|
* based on the VLEN.
|
||||||
|
*/
|
||||||
|
|
||||||
|
vlenb = detect_riscv64_get_vlenb();
|
||||||
|
|
||||||
|
if (vlenb < 16)
|
||||||
|
return NULL;
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
|
||||||
|
if (vlenb >= 32)
|
||||||
|
return &gotoblas_RISCV64_ZVL256B;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
|
||||||
|
return &gotoblas_RISCV64_ZVL128B;
|
||||||
|
#else
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
|
||||||
|
#endif // !defined(OS_LINUX)
|
||||||
|
}
|
||||||
|
|
||||||
|
static gotoblas_t* force_coretype(char* coretype) {
|
||||||
|
size_t i;
|
||||||
|
char message[128];
|
||||||
|
|
||||||
|
for (i = 0; i < NUM_CORETYPES && strcasecmp(coretype, cpuname[i]); i++);
|
||||||
|
|
||||||
|
if (i == CPU_GENERIC)
|
||||||
|
return &gotoblas_RISCV64_GENERIC;
|
||||||
|
|
||||||
|
if (i == CPU_RISCV64_ZVL256B) {
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL256B)
|
||||||
|
return &gotoblas_RISCV64_ZVL256B;
|
||||||
|
#else
|
||||||
|
openblas_warning(1,
|
||||||
|
"riscv64_zvl256b support not compiled in\n");
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
if (i == CPU_RISCV64_ZVL128B) {
|
||||||
|
#if !defined(DYNAMIC_LIST) || defined(DYN_RISCV64_ZVL128B)
|
||||||
|
return &gotoblas_RISCV64_ZVL128B;
|
||||||
|
#else
|
||||||
|
openblas_warning(1,
|
||||||
|
"riscv64_zvl128b support not compiled in\n");
|
||||||
|
return NULL;
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
|
snprintf(message, sizeof(message), "Core not found: %s\n", coretype);
|
||||||
|
openblas_warning(1, message);
|
||||||
|
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
|
||||||
|
void gotoblas_dynamic_init(void) {
|
||||||
|
|
||||||
|
char coremsg[128];
|
||||||
|
char* p;
|
||||||
|
|
||||||
|
if (gotoblas) return;
|
||||||
|
|
||||||
|
p = getenv("OPENBLAS_CORETYPE");
|
||||||
|
if (p)
|
||||||
|
gotoblas = force_coretype(p);
|
||||||
|
else
|
||||||
|
gotoblas = get_coretype();
|
||||||
|
|
||||||
|
if (!gotoblas) {
|
||||||
|
snprintf(coremsg, sizeof(coremsg), "Falling back to generic riscv64 core\n");
|
||||||
|
openblas_warning(1, coremsg);
|
||||||
|
gotoblas = &gotoblas_RISCV64_GENERIC;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (gotoblas->init) {
|
||||||
|
snprintf(coremsg, sizeof(coremsg), "Core: %s\n",
|
||||||
|
gotoblas_corename());
|
||||||
|
openblas_warning(2, coremsg);
|
||||||
|
gotoblas->init();
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
openblas_warning(0, "OpenBLAS : Architecture Initialization failed. No initialization function found.\n");
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
void gotoblas_dynamic_quit(void) {
|
||||||
|
gotoblas = NULL;
|
||||||
|
}
|
|
@ -864,15 +864,15 @@ LL(22):
|
||||||
LFD f22, 10 * SIZE(BO)
|
LFD f22, 10 * SIZE(BO)
|
||||||
LFD f23, 11 * SIZE(BO)
|
LFD f23, 11 * SIZE(BO)
|
||||||
|
|
||||||
FMADD f2, f18, f24, f2
|
FMADD f0, f18, f24, f0
|
||||||
FMADD f3, f19, f24, f3
|
FMADD f1, f19, f24, f1
|
||||||
FMADD f6, f18, f25, f6
|
FMADD f4, f18, f25, f4
|
||||||
FMADD f7, f19, f25, f7
|
FMADD f5, f19, f25, f5
|
||||||
|
|
||||||
FMADD f10, f18, f26, f10
|
FMADD f8, f18, f26, f8
|
||||||
FMADD f11, f19, f26, f11
|
FMADD f9, f19, f26, f9
|
||||||
FMADD f14, f18, f27, f14
|
FMADD f12, f18, f27, f12
|
||||||
FMADD f15, f19, f27, f15
|
FMADD f13, f19, f27, f13
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -899,15 +899,15 @@ LL(22):
|
||||||
LFD f22, 18 * SIZE(BO)
|
LFD f22, 18 * SIZE(BO)
|
||||||
LFD f23, 19 * SIZE(BO)
|
LFD f23, 19 * SIZE(BO)
|
||||||
|
|
||||||
FMADD f2, f18, f24, f2
|
FMADD f0, f18, f24, f0
|
||||||
FMADD f3, f19, f24, f3
|
FMADD f1, f19, f24, f1
|
||||||
FMADD f6, f18, f25, f6
|
FMADD f4, f18, f25, f4
|
||||||
FMADD f7, f19, f25, f7
|
FMADD f5, f19, f25, f5
|
||||||
|
|
||||||
FMADD f10, f18, f26, f10
|
FMADD f8, f18, f26, f8
|
||||||
FMADD f11, f19, f26, f11
|
FMADD f9, f19, f26, f9
|
||||||
FMADD f14, f18, f27, f14
|
FMADD f12, f18, f27, f12
|
||||||
FMADD f15, f19, f27, f15
|
FMADD f13, f19, f27, f13
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -923,14 +923,6 @@ LL(22):
|
||||||
addi BO, BO, 16 * SIZE
|
addi BO, BO, 16 * SIZE
|
||||||
bdnz LL(22)
|
bdnz LL(22)
|
||||||
|
|
||||||
fadd f0, f2, f0
|
|
||||||
fadd f1, f3, f1
|
|
||||||
fadd f4, f6, f4
|
|
||||||
fadd f5, f7, f5
|
|
||||||
fadd f8, f10, f8
|
|
||||||
fadd f9, f11, f9
|
|
||||||
fadd f12, f14, f12
|
|
||||||
fadd f13, f15, f13
|
|
||||||
.align 4
|
.align 4
|
||||||
|
|
||||||
LL(25):
|
LL(25):
|
||||||
|
@ -1161,10 +1153,10 @@ LL(32):
|
||||||
LFD f22, 10 * SIZE(BO)
|
LFD f22, 10 * SIZE(BO)
|
||||||
LFD f23, 11 * SIZE(BO)
|
LFD f23, 11 * SIZE(BO)
|
||||||
|
|
||||||
FMADD f1, f17, f24, f1
|
FMADD f0, f17, f24, f0
|
||||||
FMADD f5, f17, f25, f5
|
FMADD f4, f17, f25, f4
|
||||||
FMADD f9, f17, f26, f9
|
FMADD f8, f17, f26, f8
|
||||||
FMADD f13, f17, f27, f13
|
FMADD f12, f17, f27, f12
|
||||||
|
|
||||||
LFD f24, 12 * SIZE(BO)
|
LFD f24, 12 * SIZE(BO)
|
||||||
LFD f25, 13 * SIZE(BO)
|
LFD f25, 13 * SIZE(BO)
|
||||||
|
@ -1181,10 +1173,10 @@ LL(32):
|
||||||
LFD f22, 18 * SIZE(BO)
|
LFD f22, 18 * SIZE(BO)
|
||||||
LFD f23, 19 * SIZE(BO)
|
LFD f23, 19 * SIZE(BO)
|
||||||
|
|
||||||
FMADD f1, f19, f24, f1
|
FMADD f0, f19, f24, f0
|
||||||
FMADD f5, f19, f25, f5
|
FMADD f4, f19, f25, f4
|
||||||
FMADD f9, f19, f26, f9
|
FMADD f8, f19, f26, f8
|
||||||
FMADD f13, f19, f27, f13
|
FMADD f12, f19, f27, f12
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -1200,10 +1192,6 @@ LL(32):
|
||||||
addi BO, BO, 16 * SIZE
|
addi BO, BO, 16 * SIZE
|
||||||
bdnz LL(32)
|
bdnz LL(32)
|
||||||
|
|
||||||
fadd f0, f1, f0
|
|
||||||
fadd f4, f5, f4
|
|
||||||
fadd f8, f9, f8
|
|
||||||
fadd f12, f13, f12
|
|
||||||
.align 4
|
.align 4
|
||||||
|
|
||||||
LL(35):
|
LL(35):
|
||||||
|
@ -1691,10 +1679,10 @@ LL(52):
|
||||||
FMADD f2, f16, f21, f2
|
FMADD f2, f16, f21, f2
|
||||||
FMADD f3, f17, f21, f3
|
FMADD f3, f17, f21, f3
|
||||||
|
|
||||||
FMADD f4, f18, f22, f4
|
FMADD f0, f18, f22, f0
|
||||||
FMADD f5, f19, f22, f5
|
FMADD f1, f19, f22, f1
|
||||||
FMADD f6, f18, f23, f6
|
FMADD f2, f18, f23, f2
|
||||||
FMADD f7, f19, f23, f7
|
FMADD f3, f19, f23, f3
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -1711,10 +1699,10 @@ LL(52):
|
||||||
FMADD f2, f16, f25, f2
|
FMADD f2, f16, f25, f2
|
||||||
FMADD f3, f17, f25, f3
|
FMADD f3, f17, f25, f3
|
||||||
|
|
||||||
FMADD f4, f18, f26, f4
|
FMADD f0, f18, f26, f0
|
||||||
FMADD f5, f19, f26, f5
|
FMADD f1, f19, f26, f1
|
||||||
FMADD f6, f18, f27, f6
|
FMADD f2, f18, f27, f2
|
||||||
FMADD f7, f19, f27, f7
|
FMADD f3, f19, f27, f3
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -1775,21 +1763,11 @@ LL(58):
|
||||||
LFD f18, 0 * SIZE(CO2)
|
LFD f18, 0 * SIZE(CO2)
|
||||||
LFD f19, 1 * SIZE(CO2)
|
LFD f19, 1 * SIZE(CO2)
|
||||||
|
|
||||||
FADD f0, f4, f0
|
|
||||||
FADD f1, f5, f1
|
|
||||||
FADD f2, f6, f2
|
|
||||||
FADD f3, f7, f3
|
|
||||||
|
|
||||||
FMADD f0, f0, f30, f16
|
FMADD f0, f0, f30, f16
|
||||||
FMADD f1, f1, f30, f17
|
FMADD f1, f1, f30, f17
|
||||||
FMADD f2, f2, f30, f18
|
FMADD f2, f2, f30, f18
|
||||||
FMADD f3, f3, f30, f19
|
FMADD f3, f3, f30, f19
|
||||||
#else
|
#else
|
||||||
FADD f0, f4, f0
|
|
||||||
FADD f1, f5, f1
|
|
||||||
FADD f2, f6, f2
|
|
||||||
FADD f3, f7, f3
|
|
||||||
|
|
||||||
FMUL f0, f0, f30
|
FMUL f0, f0, f30
|
||||||
FMUL f1, f1, f30
|
FMUL f1, f1, f30
|
||||||
FMUL f2, f2, f30
|
FMUL f2, f2, f30
|
||||||
|
@ -1916,8 +1894,8 @@ LL(60):
|
||||||
LL(62):
|
LL(62):
|
||||||
FMADD f0, f16, f20, f0
|
FMADD f0, f16, f20, f0
|
||||||
FMADD f1, f16, f21, f1
|
FMADD f1, f16, f21, f1
|
||||||
FMADD f2, f17, f22, f2
|
FMADD f0, f17, f22, f0
|
||||||
FMADD f3, f17, f23, f3
|
FMADD f1, f17, f23, f1
|
||||||
|
|
||||||
LFD f20, 8 * SIZE(BO)
|
LFD f20, 8 * SIZE(BO)
|
||||||
LFD f21, 9 * SIZE(BO)
|
LFD f21, 9 * SIZE(BO)
|
||||||
|
@ -1926,8 +1904,8 @@ LL(62):
|
||||||
|
|
||||||
FMADD f0, f18, f24, f0
|
FMADD f0, f18, f24, f0
|
||||||
FMADD f1, f18, f25, f1
|
FMADD f1, f18, f25, f1
|
||||||
FMADD f2, f19, f26, f2
|
FMADD f0, f19, f26, f0
|
||||||
FMADD f3, f19, f27, f3
|
FMADD f1, f19, f27, f1
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -1986,15 +1964,9 @@ LL(68):
|
||||||
LFD f16, 0 * SIZE(CO1)
|
LFD f16, 0 * SIZE(CO1)
|
||||||
LFD f18, 0 * SIZE(CO2)
|
LFD f18, 0 * SIZE(CO2)
|
||||||
|
|
||||||
FADD f0, f2, f0
|
|
||||||
FADD f1, f3, f1
|
|
||||||
|
|
||||||
FMADD f0, f0, f30, f16
|
FMADD f0, f0, f30, f16
|
||||||
FMADD f1, f1, f30, f18
|
FMADD f1, f1, f30, f18
|
||||||
#else
|
#else
|
||||||
FADD f0, f2, f0
|
|
||||||
FADD f1, f3, f1
|
|
||||||
|
|
||||||
FMUL f0, f0, f30
|
FMUL f0, f0, f30
|
||||||
FMUL f1, f1, f30
|
FMUL f1, f1, f30
|
||||||
#endif
|
#endif
|
||||||
|
@ -2007,7 +1979,6 @@ LL(68):
|
||||||
fmr f4, f0
|
fmr f4, f0
|
||||||
fmr f5, f0
|
fmr f5, f0
|
||||||
|
|
||||||
|
|
||||||
#ifdef TRMMKERNEL
|
#ifdef TRMMKERNEL
|
||||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||||
(!defined(LEFT) && !defined(TRANSA))
|
(!defined(LEFT) && !defined(TRANSA))
|
||||||
|
@ -2332,8 +2303,8 @@ LL(80):
|
||||||
LL(82):
|
LL(82):
|
||||||
FMADD f0, f16, f20, f0
|
FMADD f0, f16, f20, f0
|
||||||
FMADD f1, f17, f20, f1
|
FMADD f1, f17, f20, f1
|
||||||
FMADD f2, f18, f21, f2
|
FMADD f0, f18, f21, f0
|
||||||
FMADD f3, f19, f21, f3
|
FMADD f1, f19, f21, f1
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -2342,8 +2313,8 @@ LL(82):
|
||||||
|
|
||||||
FMADD f0, f16, f22, f0
|
FMADD f0, f16, f22, f0
|
||||||
FMADD f1, f17, f22, f1
|
FMADD f1, f17, f22, f1
|
||||||
FMADD f2, f18, f23, f2
|
FMADD f0, f18, f23, f0
|
||||||
FMADD f3, f19, f23, f3
|
FMADD f1, f19, f23, f1
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -2401,15 +2372,9 @@ LL(88):
|
||||||
LFD f16, 0 * SIZE(CO1)
|
LFD f16, 0 * SIZE(CO1)
|
||||||
LFD f17, 1 * SIZE(CO1)
|
LFD f17, 1 * SIZE(CO1)
|
||||||
|
|
||||||
FADD f0, f2, f0
|
|
||||||
FADD f1, f3, f1
|
|
||||||
|
|
||||||
FMADD f0, f0, f30, f16
|
FMADD f0, f0, f30, f16
|
||||||
FMADD f1, f1, f30, f17
|
FMADD f1, f1, f30, f17
|
||||||
#else
|
#else
|
||||||
FADD f0, f2, f0
|
|
||||||
FADD f1, f3, f1
|
|
||||||
|
|
||||||
FMUL f0, f0, f30
|
FMUL f0, f0, f30
|
||||||
FMUL f1, f1, f30
|
FMUL f1, f1, f30
|
||||||
#endif
|
#endif
|
||||||
|
@ -2418,9 +2383,6 @@ LL(88):
|
||||||
STFD f1, 1 * SIZE(CO1)
|
STFD f1, 1 * SIZE(CO1)
|
||||||
|
|
||||||
lfs f0, FZERO
|
lfs f0, FZERO
|
||||||
fmr f1, f0
|
|
||||||
fmr f2, f0
|
|
||||||
fmr f3, f0
|
|
||||||
|
|
||||||
addi CO1, CO1, 2 * SIZE
|
addi CO1, CO1, 2 * SIZE
|
||||||
|
|
||||||
|
@ -2512,9 +2474,9 @@ LL(90):
|
||||||
|
|
||||||
LL(92):
|
LL(92):
|
||||||
FMADD f0, f16, f20, f0
|
FMADD f0, f16, f20, f0
|
||||||
FMADD f1, f17, f21, f1
|
FMADD f0, f17, f21, f0
|
||||||
FMADD f2, f18, f22, f2
|
FMADD f0, f18, f22, f0
|
||||||
FMADD f3, f19, f23, f3
|
FMADD f0, f19, f23, f0
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -2527,9 +2489,9 @@ LL(92):
|
||||||
LFD f23, 7 * SIZE(BO)
|
LFD f23, 7 * SIZE(BO)
|
||||||
|
|
||||||
FMADD f0, f16, f20, f0
|
FMADD f0, f16, f20, f0
|
||||||
FMADD f1, f17, f21, f1
|
FMADD f0, f17, f21, f0
|
||||||
FMADD f2, f18, f22, f2
|
FMADD f0, f18, f22, f0
|
||||||
FMADD f3, f19, f23, f3
|
FMADD f0, f19, f23, f0
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -2583,16 +2545,8 @@ LL(98):
|
||||||
#ifndef TRMMKERNEL
|
#ifndef TRMMKERNEL
|
||||||
LFD f16, 0 * SIZE(CO1)
|
LFD f16, 0 * SIZE(CO1)
|
||||||
|
|
||||||
FADD f0, f1, f0
|
|
||||||
FADD f2, f3, f2
|
|
||||||
FADD f0, f2, f0
|
|
||||||
|
|
||||||
FMADD f0, f0, f30, f16
|
FMADD f0, f0, f30, f16
|
||||||
#else
|
#else
|
||||||
FADD f0, f1, f0
|
|
||||||
FADD f2, f3, f2
|
|
||||||
FADD f0, f2, f0
|
|
||||||
|
|
||||||
FMUL f0, f0, f30
|
FMUL f0, f0, f30
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
File diff suppressed because it is too large
Load Diff
|
@ -1159,9 +1159,9 @@ LL(20):
|
||||||
|
|
||||||
LL(22):
|
LL(22):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
LFD f28, 4 * SIZE(AO)
|
LFD f28, 4 * SIZE(AO)
|
||||||
LFD f29, 5 * SIZE(AO)
|
LFD f29, 5 * SIZE(AO)
|
||||||
|
@ -1169,9 +1169,9 @@ LL(22):
|
||||||
LFD f31, 7 * SIZE(AO)
|
LFD f31, 7 * SIZE(AO)
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f20, 8 * SIZE(BO)
|
LFD f20, 8 * SIZE(BO)
|
||||||
LFD f21, 9 * SIZE(BO)
|
LFD f21, 9 * SIZE(BO)
|
||||||
|
@ -1179,14 +1179,14 @@ LL(22):
|
||||||
LFD f23, 11 * SIZE(BO)
|
LFD f23, 11 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f8, f16, f24, f8
|
FMA1 f8, f16, f24, f8
|
||||||
FMA4 f11, f17, f24, f11
|
|
||||||
FMA2 f9, f16, f25, f9
|
FMA2 f9, f16, f25, f9
|
||||||
FMA3 f10, f17, f25, f10
|
FMA4 f9, f17, f24, f9
|
||||||
|
FMA3 f8, f17, f25, f8
|
||||||
|
|
||||||
FMA1 f12, f16, f26, f12
|
FMA1 f12, f16, f26, f12
|
||||||
FMA4 f15, f17, f26, f15
|
|
||||||
FMA2 f13, f16, f27, f13
|
FMA2 f13, f16, f27, f13
|
||||||
FMA3 f14, f17, f27, f14
|
FMA4 f13, f17, f26, f13
|
||||||
|
FMA3 f12, f17, f27, f12
|
||||||
|
|
||||||
LFD f24, 12 * SIZE(BO)
|
LFD f24, 12 * SIZE(BO)
|
||||||
LFD f25, 13 * SIZE(BO)
|
LFD f25, 13 * SIZE(BO)
|
||||||
|
@ -1194,14 +1194,14 @@ LL(22):
|
||||||
LFD f27, 15 * SIZE(BO)
|
LFD f27, 15 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f18, f20, f0
|
FMA1 f0, f18, f20, f0
|
||||||
FMA4 f3, f19, f20, f3
|
|
||||||
FMA2 f1, f18, f21, f1
|
FMA2 f1, f18, f21, f1
|
||||||
FMA3 f2, f19, f21, f2
|
FMA4 f1, f19, f20, f1
|
||||||
|
FMA3 f0, f19, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f18, f22, f4
|
FMA1 f4, f18, f22, f4
|
||||||
FMA4 f7, f19, f22, f7
|
|
||||||
FMA2 f5, f18, f23, f5
|
FMA2 f5, f18, f23, f5
|
||||||
FMA3 f6, f19, f23, f6
|
FMA4 f5, f19, f22, f5
|
||||||
|
FMA3 f4, f19, f23, f4
|
||||||
|
|
||||||
LFD f20, 16 * SIZE(BO)
|
LFD f20, 16 * SIZE(BO)
|
||||||
LFD f21, 17 * SIZE(BO)
|
LFD f21, 17 * SIZE(BO)
|
||||||
|
@ -1209,14 +1209,14 @@ LL(22):
|
||||||
LFD f23, 19 * SIZE(BO)
|
LFD f23, 19 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f8, f18, f24, f8
|
FMA1 f8, f18, f24, f8
|
||||||
FMA4 f11, f19, f24, f11
|
|
||||||
FMA2 f9, f18, f25, f9
|
FMA2 f9, f18, f25, f9
|
||||||
FMA3 f10, f19, f25, f10
|
FMA4 f9, f19, f24, f9
|
||||||
|
FMA3 f8, f19, f25, f8
|
||||||
|
|
||||||
FMA1 f12, f18, f26, f12
|
FMA1 f12, f18, f26, f12
|
||||||
FMA4 f15, f19, f26, f15
|
|
||||||
FMA2 f13, f18, f27, f13
|
FMA2 f13, f18, f27, f13
|
||||||
FMA3 f14, f19, f27, f14
|
FMA4 f13, f19, f26, f13
|
||||||
|
FMA3 f12, f19, f27, f12
|
||||||
|
|
||||||
LFD f24, 20 * SIZE(BO)
|
LFD f24, 20 * SIZE(BO)
|
||||||
LFD f25, 21 * SIZE(BO)
|
LFD f25, 21 * SIZE(BO)
|
||||||
|
@ -1224,9 +1224,9 @@ LL(22):
|
||||||
LFD f27, 23 * SIZE(BO)
|
LFD f27, 23 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f28, f20, f0
|
FMA1 f0, f28, f20, f0
|
||||||
FMA4 f3, f29, f20, f3
|
|
||||||
FMA2 f1, f28, f21, f1
|
FMA2 f1, f28, f21, f1
|
||||||
FMA3 f2, f29, f21, f2
|
FMA4 f1, f29, f20, f1
|
||||||
|
FMA3 f0, f29, f21, f0
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -1234,9 +1234,9 @@ LL(22):
|
||||||
LFD f19, 11 * SIZE(AO)
|
LFD f19, 11 * SIZE(AO)
|
||||||
|
|
||||||
FMA1 f4, f28, f22, f4
|
FMA1 f4, f28, f22, f4
|
||||||
FMA4 f7, f29, f22, f7
|
|
||||||
FMA2 f5, f28, f23, f5
|
FMA2 f5, f28, f23, f5
|
||||||
FMA3 f6, f29, f23, f6
|
FMA4 f5, f29, f22, f5
|
||||||
|
FMA3 f4, f29, f23, f4
|
||||||
|
|
||||||
LFD f20, 24 * SIZE(BO)
|
LFD f20, 24 * SIZE(BO)
|
||||||
LFD f21, 25 * SIZE(BO)
|
LFD f21, 25 * SIZE(BO)
|
||||||
|
@ -1244,14 +1244,14 @@ LL(22):
|
||||||
LFD f23, 27 * SIZE(BO)
|
LFD f23, 27 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f8, f28, f24, f8
|
FMA1 f8, f28, f24, f8
|
||||||
FMA4 f11, f29, f24, f11
|
|
||||||
FMA2 f9, f28, f25, f9
|
FMA2 f9, f28, f25, f9
|
||||||
FMA3 f10, f29, f25, f10
|
FMA4 f9, f29, f24, f9
|
||||||
|
FMA3 f8, f29, f25, f8
|
||||||
|
|
||||||
FMA1 f12, f28, f26, f12
|
FMA1 f12, f28, f26, f12
|
||||||
FMA4 f15, f29, f26, f15
|
|
||||||
FMA2 f13, f28, f27, f13
|
FMA2 f13, f28, f27, f13
|
||||||
FMA3 f14, f29, f27, f14
|
FMA4 f13, f29, f26, f13
|
||||||
|
FMA3 f12, f29, f27, f12
|
||||||
|
|
||||||
LFD f24, 28 * SIZE(BO)
|
LFD f24, 28 * SIZE(BO)
|
||||||
LFD f25, 29 * SIZE(BO)
|
LFD f25, 29 * SIZE(BO)
|
||||||
|
@ -1259,14 +1259,14 @@ LL(22):
|
||||||
LFD f27, 31 * SIZE(BO)
|
LFD f27, 31 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f30, f20, f0
|
FMA1 f0, f30, f20, f0
|
||||||
FMA4 f3, f31, f20, f3
|
|
||||||
FMA2 f1, f30, f21, f1
|
FMA2 f1, f30, f21, f1
|
||||||
FMA3 f2, f31, f21, f2
|
FMA4 f1, f31, f20, f1
|
||||||
|
FMA3 f0, f31, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f30, f22, f4
|
FMA1 f4, f30, f22, f4
|
||||||
FMA4 f7, f31, f22, f7
|
|
||||||
FMA2 f5, f30, f23, f5
|
FMA2 f5, f30, f23, f5
|
||||||
FMA3 f6, f31, f23, f6
|
FMA4 f5, f31, f22, f5
|
||||||
|
FMA3 f4, f31, f23, f4
|
||||||
|
|
||||||
LFD f20, 32 * SIZE(BO)
|
LFD f20, 32 * SIZE(BO)
|
||||||
LFD f21, 33 * SIZE(BO)
|
LFD f21, 33 * SIZE(BO)
|
||||||
|
@ -1274,14 +1274,14 @@ LL(22):
|
||||||
LFD f23, 35 * SIZE(BO)
|
LFD f23, 35 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f8, f30, f24, f8
|
FMA1 f8, f30, f24, f8
|
||||||
FMA4 f11, f31, f24, f11
|
|
||||||
FMA2 f9, f30, f25, f9
|
FMA2 f9, f30, f25, f9
|
||||||
FMA3 f10, f31, f25, f10
|
FMA4 f9, f31, f24, f9
|
||||||
|
FMA3 f8, f31, f25, f8
|
||||||
|
|
||||||
FMA1 f12, f30, f26, f12
|
FMA1 f12, f30, f26, f12
|
||||||
FMA4 f15, f31, f26, f15
|
|
||||||
FMA2 f13, f30, f27, f13
|
FMA2 f13, f30, f27, f13
|
||||||
FMA3 f14, f31, f27, f14
|
FMA4 f13, f31, f26, f13
|
||||||
|
FMA3 f12, f31, f27, f12
|
||||||
|
|
||||||
LFD f24, 36 * SIZE(BO)
|
LFD f24, 36 * SIZE(BO)
|
||||||
LFD f25, 37 * SIZE(BO)
|
LFD f25, 37 * SIZE(BO)
|
||||||
|
@ -1318,14 +1318,14 @@ LL(25):
|
||||||
|
|
||||||
LL(26):
|
LL(26):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f20, 8 * SIZE(BO)
|
LFD f20, 8 * SIZE(BO)
|
||||||
LFD f21, 9 * SIZE(BO)
|
LFD f21, 9 * SIZE(BO)
|
||||||
|
@ -1333,14 +1333,14 @@ LL(26):
|
||||||
LFD f23, 11 * SIZE(BO)
|
LFD f23, 11 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f8, f16, f24, f8
|
FMA1 f8, f16, f24, f8
|
||||||
FMA4 f11, f17, f24, f11
|
|
||||||
FMA2 f9, f16, f25, f9
|
FMA2 f9, f16, f25, f9
|
||||||
FMA3 f10, f17, f25, f10
|
FMA4 f9, f17, f24, f9
|
||||||
|
FMA3 f8, f17, f25, f8
|
||||||
|
|
||||||
FMA1 f12, f16, f26, f12
|
FMA1 f12, f16, f26, f12
|
||||||
FMA4 f15, f17, f26, f15
|
|
||||||
FMA2 f13, f16, f27, f13
|
FMA2 f13, f16, f27, f13
|
||||||
FMA3 f14, f17, f27, f14
|
FMA4 f13, f17, f26, f13
|
||||||
|
FMA3 f12, f17, f27, f12
|
||||||
|
|
||||||
LFD f16, 2 * SIZE(AO)
|
LFD f16, 2 * SIZE(AO)
|
||||||
LFD f17, 3 * SIZE(AO)
|
LFD f17, 3 * SIZE(AO)
|
||||||
|
@ -1363,47 +1363,42 @@ LL(28):
|
||||||
LFD f18, 0 * SIZE(CO2)
|
LFD f18, 0 * SIZE(CO2)
|
||||||
LFD f19, 1 * SIZE(CO2)
|
LFD f19, 1 * SIZE(CO2)
|
||||||
|
|
||||||
FADD f0, f0, f2
|
|
||||||
FADD f1, f1, f3
|
|
||||||
FADD f4, f4, f6
|
|
||||||
FADD f5, f5, f7
|
|
||||||
|
|
||||||
LFD f20, 0 * SIZE(CO3)
|
LFD f20, 0 * SIZE(CO3)
|
||||||
LFD f21, 1 * SIZE(CO3)
|
LFD f21, 1 * SIZE(CO3)
|
||||||
LFD f22, 0 * SIZE(CO4)
|
LFD f22, 0 * SIZE(CO4)
|
||||||
LFD f23, 1 * SIZE(CO4)
|
LFD f23, 1 * SIZE(CO4)
|
||||||
|
|
||||||
FADD f8, f8, f10
|
fmr f2, f0
|
||||||
FADD f9, f9, f11
|
fmr f3, f1
|
||||||
FADD f12, f12, f14
|
fmr f6, f4
|
||||||
FADD f13, f13, f15
|
fmr f7, f5
|
||||||
|
|
||||||
FNMSUB f24, f31, f1, f16
|
FMADD f24, f30, f0, f16
|
||||||
FMADD f25, f31, f0, f17
|
FMADD f25, f30, f1, f17
|
||||||
FNMSUB f26, f31, f5, f18
|
FMADD f26, f30, f4, f18
|
||||||
FMADD f27, f31, f4, f19
|
FMADD f27, f30, f5, f19
|
||||||
|
|
||||||
FMADD f0, f30, f0, f24
|
FNMSUB f0, f31, f3, f24
|
||||||
FMADD f1, f30, f1, f25
|
FMADD f1, f31, f2, f25
|
||||||
FMADD f4, f30, f4, f26
|
FNMSUB f4, f31, f7, f26
|
||||||
FMADD f5, f30, f5, f27
|
FMADD f5, f31, f6, f27
|
||||||
|
|
||||||
FNMSUB f24, f31, f9, f20
|
fmr f10, f8
|
||||||
FMADD f25, f31, f8, f21
|
fmr f11, f9
|
||||||
FNMSUB f26, f31, f13, f22
|
fmr f14, f12
|
||||||
FMADD f27, f31, f12, f23
|
fmr f15, f13
|
||||||
|
|
||||||
FMADD f8, f30, f8, f24
|
FMADD f24, f30, f8, f20
|
||||||
FMADD f9, f30, f9, f25
|
FMADD f25, f30, f9, f21
|
||||||
FMADD f12, f30, f12, f26
|
FMADD f26, f30, f12, f22
|
||||||
FMADD f13, f30, f13, f27
|
FMADD f27, f30, f13, f23
|
||||||
|
|
||||||
|
FNMSUB f8, f31, f11, f24
|
||||||
|
FMADD f9, f31, f10, f25
|
||||||
|
FNMSUB f12, f31, f15, f26
|
||||||
|
FMADD f13, f31, f14, f27
|
||||||
|
|
||||||
#else
|
#else
|
||||||
FADD f0, f0, f2
|
|
||||||
FADD f1, f1, f3
|
|
||||||
FADD f4, f4, f6
|
|
||||||
FADD f5, f5, f7
|
|
||||||
|
|
||||||
FMUL f16, f31, f1
|
FMUL f16, f31, f1
|
||||||
FMUL f17, f31, f0
|
FMUL f17, f31, f0
|
||||||
FMUL f18, f31, f5
|
FMUL f18, f31, f5
|
||||||
|
@ -1414,11 +1409,6 @@ LL(28):
|
||||||
FMSUB f4, f30, f4, f18
|
FMSUB f4, f30, f4, f18
|
||||||
FMADD f5, f30, f5, f19
|
FMADD f5, f30, f5, f19
|
||||||
|
|
||||||
FADD f8, f8, f10
|
|
||||||
FADD f9, f9, f11
|
|
||||||
FADD f12, f12, f14
|
|
||||||
FADD f13, f13, f15
|
|
||||||
|
|
||||||
FMUL f20, f31, f9
|
FMUL f20, f31, f9
|
||||||
FMUL f21, f31, f8
|
FMUL f21, f31, f8
|
||||||
FMUL f22, f31, f13
|
FMUL f22, f31, f13
|
||||||
|
@ -1616,15 +1606,15 @@ LL(32):
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA2 f7, f18, f23, f7
|
FMA2 f7, f18, f23, f7
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
FMA4 f13, f17, f22, f13
|
FMA4 f5, f17, f22, f5
|
||||||
FMA4 f15, f19, f22, f15
|
FMA4 f7, f19, f22, f7
|
||||||
FMA3 f12, f17, f23, f12
|
FMA3 f4, f17, f23, f4
|
||||||
FMA3 f14, f19, f23, f14
|
FMA3 f6, f19, f23, f6
|
||||||
|
|
||||||
LFD f20, 8 * SIZE(BO)
|
LFD f20, 8 * SIZE(BO)
|
||||||
LFD f21, 9 * SIZE(BO)
|
LFD f21, 9 * SIZE(BO)
|
||||||
|
@ -1646,15 +1636,15 @@ LL(32):
|
||||||
FMA2 f5, f28, f27, f5
|
FMA2 f5, f28, f27, f5
|
||||||
FMA2 f7, f30, f27, f7
|
FMA2 f7, f30, f27, f7
|
||||||
|
|
||||||
FMA4 f9, f29, f24, f9
|
FMA4 f1, f29, f24, f1
|
||||||
FMA4 f11, f31, f24, f11
|
FMA4 f3, f31, f24, f3
|
||||||
FMA3 f8, f29, f25, f8
|
FMA3 f0, f29, f25, f0
|
||||||
FMA3 f10, f31, f25, f10
|
FMA3 f2, f31, f25, f2
|
||||||
|
|
||||||
FMA4 f13, f29, f26, f13
|
FMA4 f5, f29, f26, f5
|
||||||
FMA4 f15, f31, f26, f15
|
FMA4 f7, f31, f26, f7
|
||||||
FMA3 f12, f29, f27, f12
|
FMA3 f4, f29, f27, f4
|
||||||
FMA3 f14, f31, f27, f14
|
FMA3 f6, f31, f27, f6
|
||||||
|
|
||||||
LFD f24, 12 * SIZE(BO)
|
LFD f24, 12 * SIZE(BO)
|
||||||
LFD f25, 13 * SIZE(BO)
|
LFD f25, 13 * SIZE(BO)
|
||||||
|
@ -1676,15 +1666,15 @@ LL(32):
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA2 f7, f18, f23, f7
|
FMA2 f7, f18, f23, f7
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
FMA4 f13, f17, f22, f13
|
FMA4 f5, f17, f22, f5
|
||||||
FMA4 f15, f19, f22, f15
|
FMA4 f7, f19, f22, f7
|
||||||
FMA3 f12, f17, f23, f12
|
FMA3 f4, f17, f23, f4
|
||||||
FMA3 f14, f19, f23, f14
|
FMA3 f6, f19, f23, f6
|
||||||
|
|
||||||
LFD f20, 16 * SIZE(BO)
|
LFD f20, 16 * SIZE(BO)
|
||||||
LFD f21, 17 * SIZE(BO)
|
LFD f21, 17 * SIZE(BO)
|
||||||
|
@ -1706,15 +1696,15 @@ LL(32):
|
||||||
FMA2 f5, f28, f27, f5
|
FMA2 f5, f28, f27, f5
|
||||||
FMA2 f7, f30, f27, f7
|
FMA2 f7, f30, f27, f7
|
||||||
|
|
||||||
FMA4 f9, f29, f24, f9
|
FMA4 f1, f29, f24, f1
|
||||||
FMA4 f11, f31, f24, f11
|
FMA4 f3, f31, f24, f3
|
||||||
FMA3 f8, f29, f25, f8
|
FMA3 f0, f29, f25, f0
|
||||||
FMA3 f10, f31, f25, f10
|
FMA3 f2, f31, f25, f2
|
||||||
|
|
||||||
FMA4 f13, f29, f26, f13
|
FMA4 f5, f29, f26, f5
|
||||||
FMA4 f15, f31, f26, f15
|
FMA4 f7, f31, f26, f7
|
||||||
FMA3 f12, f29, f27, f12
|
FMA3 f4, f29, f27, f4
|
||||||
FMA3 f14, f31, f27, f14
|
FMA3 f6, f31, f27, f6
|
||||||
|
|
||||||
LFD f24, 20 * SIZE(BO)
|
LFD f24, 20 * SIZE(BO)
|
||||||
LFD f25, 21 * SIZE(BO)
|
LFD f25, 21 * SIZE(BO)
|
||||||
|
@ -1736,15 +1726,15 @@ LL(32):
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA2 f7, f18, f23, f7
|
FMA2 f7, f18, f23, f7
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
FMA4 f13, f17, f22, f13
|
FMA4 f5, f17, f22, f5
|
||||||
FMA4 f15, f19, f22, f15
|
FMA4 f7, f19, f22, f7
|
||||||
FMA3 f12, f17, f23, f12
|
FMA3 f4, f17, f23, f4
|
||||||
FMA3 f14, f19, f23, f14
|
FMA3 f6, f19, f23, f6
|
||||||
|
|
||||||
LFD f20, 24 * SIZE(BO)
|
LFD f20, 24 * SIZE(BO)
|
||||||
LFD f21, 25 * SIZE(BO)
|
LFD f21, 25 * SIZE(BO)
|
||||||
|
@ -1766,15 +1756,15 @@ LL(32):
|
||||||
FMA2 f5, f28, f27, f5
|
FMA2 f5, f28, f27, f5
|
||||||
FMA2 f7, f30, f27, f7
|
FMA2 f7, f30, f27, f7
|
||||||
|
|
||||||
FMA4 f9, f29, f24, f9
|
FMA4 f1, f29, f24, f1
|
||||||
FMA4 f11, f31, f24, f11
|
FMA4 f3, f31, f24, f3
|
||||||
FMA3 f8, f29, f25, f8
|
FMA3 f0, f29, f25, f0
|
||||||
FMA3 f10, f31, f25, f10
|
FMA3 f2, f31, f25, f2
|
||||||
|
|
||||||
FMA4 f13, f29, f26, f13
|
FMA4 f5, f29, f26, f5
|
||||||
FMA4 f15, f31, f26, f15
|
FMA4 f7, f31, f26, f7
|
||||||
FMA3 f12, f29, f27, f12
|
FMA3 f4, f29, f27, f4
|
||||||
FMA3 f14, f31, f27, f14
|
FMA3 f6, f31, f27, f6
|
||||||
|
|
||||||
LFD f24, 28 * SIZE(BO)
|
LFD f24, 28 * SIZE(BO)
|
||||||
LFD f25, 29 * SIZE(BO)
|
LFD f25, 29 * SIZE(BO)
|
||||||
|
@ -1796,15 +1786,15 @@ LL(32):
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA2 f7, f18, f23, f7
|
FMA2 f7, f18, f23, f7
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
FMA4 f13, f17, f22, f13
|
FMA4 f5, f17, f22, f5
|
||||||
FMA4 f15, f19, f22, f15
|
FMA4 f7, f19, f22, f7
|
||||||
FMA3 f12, f17, f23, f12
|
FMA3 f4, f17, f23, f4
|
||||||
FMA3 f14, f19, f23, f14
|
FMA3 f6, f19, f23, f6
|
||||||
|
|
||||||
LFD f20, 32 * SIZE(BO)
|
LFD f20, 32 * SIZE(BO)
|
||||||
LFD f21, 33 * SIZE(BO)
|
LFD f21, 33 * SIZE(BO)
|
||||||
|
@ -1826,15 +1816,15 @@ LL(32):
|
||||||
FMA2 f5, f28, f27, f5
|
FMA2 f5, f28, f27, f5
|
||||||
FMA2 f7, f30, f27, f7
|
FMA2 f7, f30, f27, f7
|
||||||
|
|
||||||
FMA4 f9, f29, f24, f9
|
FMA4 f1, f29, f24, f1
|
||||||
FMA4 f11, f31, f24, f11
|
FMA4 f3, f31, f24, f3
|
||||||
FMA3 f8, f29, f25, f8
|
FMA3 f0, f29, f25, f0
|
||||||
FMA3 f10, f31, f25, f10
|
FMA3 f2, f31, f25, f2
|
||||||
|
|
||||||
FMA4 f13, f29, f26, f13
|
FMA4 f5, f29, f26, f5
|
||||||
FMA4 f15, f31, f26, f15
|
FMA4 f7, f31, f26, f7
|
||||||
FMA3 f12, f29, f27, f12
|
FMA3 f4, f29, f27, f4
|
||||||
FMA3 f14, f31, f27, f14
|
FMA3 f6, f31, f27, f6
|
||||||
|
|
||||||
LFD f24, 36 * SIZE(BO)
|
LFD f24, 36 * SIZE(BO)
|
||||||
LFD f25, 37 * SIZE(BO)
|
LFD f25, 37 * SIZE(BO)
|
||||||
|
@ -1883,20 +1873,20 @@ LL(36):
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA2 f7, f18, f23, f7
|
FMA2 f7, f18, f23, f7
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f18, 6 * SIZE(AO)
|
LFD f18, 6 * SIZE(AO)
|
||||||
LFD f20, 4 * SIZE(BO)
|
LFD f20, 4 * SIZE(BO)
|
||||||
LFD f21, 5 * SIZE(BO)
|
LFD f21, 5 * SIZE(BO)
|
||||||
|
|
||||||
FMA4 f13, f17, f22, f13
|
FMA4 f5, f17, f22, f5
|
||||||
FMA4 f15, f19, f22, f15
|
FMA4 f7, f19, f22, f7
|
||||||
FMA3 f12, f17, f23, f12
|
FMA3 f4, f17, f23, f4
|
||||||
FMA3 f14, f19, f23, f14
|
FMA3 f6, f19, f23, f6
|
||||||
|
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
LFD f19, 7 * SIZE(AO)
|
LFD f19, 7 * SIZE(AO)
|
||||||
|
@ -1916,52 +1906,42 @@ LL(38):
|
||||||
LFD f18, 2 * SIZE(CO1)
|
LFD f18, 2 * SIZE(CO1)
|
||||||
LFD f19, 3 * SIZE(CO1)
|
LFD f19, 3 * SIZE(CO1)
|
||||||
|
|
||||||
FADD f0, f0, f8
|
|
||||||
FADD f1, f1, f9
|
|
||||||
FADD f2, f2, f10
|
|
||||||
FADD f3, f3, f11
|
|
||||||
|
|
||||||
LFD f20, 0 * SIZE(CO2)
|
LFD f20, 0 * SIZE(CO2)
|
||||||
LFD f21, 1 * SIZE(CO2)
|
LFD f21, 1 * SIZE(CO2)
|
||||||
LFD f22, 2 * SIZE(CO2)
|
LFD f22, 2 * SIZE(CO2)
|
||||||
LFD f23, 3 * SIZE(CO2)
|
LFD f23, 3 * SIZE(CO2)
|
||||||
|
|
||||||
FADD f4, f4, f12
|
fmr f8, f0
|
||||||
FADD f5, f5, f13
|
fmr f9, f1
|
||||||
FADD f6, f6, f14
|
fmr f10, f2
|
||||||
FADD f7, f7, f15
|
fmr f11, f3
|
||||||
|
|
||||||
FNMSUB f24, f31, f1, f16
|
FMADD f24, f30, f0, f16
|
||||||
FMADD f25, f31, f0, f17
|
FMADD f25, f30, f1, f17
|
||||||
FNMSUB f26, f31, f3, f18
|
FMADD f26, f30, f2, f18
|
||||||
FMADD f27, f31, f2, f19
|
FMADD f27, f30, f3, f19
|
||||||
|
|
||||||
FMADD f0, f30, f0, f24
|
FNMSUB f0, f31, f9, f24
|
||||||
FMADD f1, f30, f1, f25
|
FMADD f1, f31, f8, f25
|
||||||
FMADD f2, f30, f2, f26
|
FNMSUB f2, f31, f11, f26
|
||||||
FMADD f3, f30, f3, f27
|
FMADD f3, f31, f10, f27
|
||||||
|
|
||||||
FNMSUB f24, f31, f5, f20
|
fmr f12, f4
|
||||||
FMADD f25, f31, f4, f21
|
fmr f13, f5
|
||||||
FNMSUB f26, f31, f7, f22
|
fmr f14, f6
|
||||||
FMADD f27, f31, f6, f23
|
fmr f15, f7
|
||||||
|
|
||||||
FMADD f4, f30, f4, f24
|
FMADD f24, f30, f4, f20
|
||||||
FMADD f5, f30, f5, f25
|
FMADD f25, f30, f5, f21
|
||||||
FMADD f6, f30, f6, f26
|
FMADD f26, f30, f6, f22
|
||||||
FMADD f7, f30, f7, f27
|
FMADD f27, f30, f7, f23
|
||||||
|
|
||||||
|
FNMSUB f4, f31, f13, f24
|
||||||
|
FMADD f5, f31, f12, f25
|
||||||
|
FNMSUB f6, f31, f15, f26
|
||||||
|
FMADD f7, f31, f14, f27
|
||||||
|
|
||||||
#else
|
#else
|
||||||
FADD f0, f0, f8
|
|
||||||
FADD f1, f1, f9
|
|
||||||
FADD f2, f2, f10
|
|
||||||
FADD f3, f3, f11
|
|
||||||
|
|
||||||
FADD f4, f4, f12
|
|
||||||
FADD f5, f5, f13
|
|
||||||
FADD f6, f6, f14
|
|
||||||
FADD f7, f7, f15
|
|
||||||
|
|
||||||
FMUL f16, f31, f1
|
FMUL f16, f31, f1
|
||||||
FMUL f17, f31, f0
|
FMUL f17, f31, f0
|
||||||
FMUL f18, f31, f3
|
FMUL f18, f31, f3
|
||||||
|
@ -2101,14 +2081,14 @@ LL(40):
|
||||||
|
|
||||||
LL(42):
|
LL(42):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f16, 2 * SIZE(AO)
|
LFD f16, 2 * SIZE(AO)
|
||||||
LFD f17, 3 * SIZE(AO)
|
LFD f17, 3 * SIZE(AO)
|
||||||
|
@ -2119,14 +2099,14 @@ LL(42):
|
||||||
LFD f23, 7 * SIZE(BO)
|
LFD f23, 7 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -2137,14 +2117,14 @@ LL(42):
|
||||||
LFD f23, 11 * SIZE(BO)
|
LFD f23, 11 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f16, 6 * SIZE(AO)
|
LFD f16, 6 * SIZE(AO)
|
||||||
LFD f17, 7 * SIZE(AO)
|
LFD f17, 7 * SIZE(AO)
|
||||||
|
@ -2155,14 +2135,14 @@ LL(42):
|
||||||
LFD f23, 15 * SIZE(BO)
|
LFD f23, 15 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -2202,14 +2182,14 @@ LL(45):
|
||||||
|
|
||||||
LL(46):
|
LL(46):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
FMA1 f4, f16, f22, f4
|
FMA1 f4, f16, f22, f4
|
||||||
FMA4 f7, f17, f22, f7
|
|
||||||
FMA2 f5, f16, f23, f5
|
FMA2 f5, f16, f23, f5
|
||||||
FMA3 f6, f17, f23, f6
|
FMA4 f5, f17, f22, f5
|
||||||
|
FMA3 f4, f17, f23, f4
|
||||||
|
|
||||||
LFD f16, 2 * SIZE(AO)
|
LFD f16, 2 * SIZE(AO)
|
||||||
LFD f17, 3 * SIZE(AO)
|
LFD f17, 3 * SIZE(AO)
|
||||||
|
@ -2231,27 +2211,22 @@ LL(48):
|
||||||
LFD f20, 0 * SIZE(CO2)
|
LFD f20, 0 * SIZE(CO2)
|
||||||
LFD f21, 1 * SIZE(CO2)
|
LFD f21, 1 * SIZE(CO2)
|
||||||
|
|
||||||
FADD f0, f0, f2
|
fmr f2, f0
|
||||||
FADD f1, f1, f3
|
fmr f3, f1
|
||||||
FADD f4, f4, f6
|
fmr f6, f4
|
||||||
FADD f5, f5, f7
|
fmr f7, f5
|
||||||
|
|
||||||
FNMSUB f24, f31, f1, f16
|
FMADD f24, f30, f0, f16
|
||||||
FMADD f25, f31, f0, f17
|
FMADD f25, f30, f1, f17
|
||||||
FNMSUB f26, f31, f5, f20
|
FMADD f26, f30, f4, f20
|
||||||
FMADD f27, f31, f4, f21
|
FMADD f27, f30, f5, f21
|
||||||
|
|
||||||
FMADD f0, f30, f0, f24
|
FNMSUB f0, f31, f3, f24
|
||||||
FMADD f1, f30, f1, f25
|
FMADD f1, f31, f2, f25
|
||||||
FMADD f4, f30, f4, f26
|
FNMSUB f4, f31, f7, f26
|
||||||
FMADD f5, f30, f5, f27
|
FMADD f5, f31, f6, f27
|
||||||
|
|
||||||
#else
|
#else
|
||||||
FADD f0, f0, f2
|
|
||||||
FADD f1, f1, f3
|
|
||||||
FADD f4, f4, f6
|
|
||||||
FADD f5, f5, f7
|
|
||||||
|
|
||||||
FMUL f16, f31, f1
|
FMUL f16, f31, f1
|
||||||
FMUL f17, f31, f0
|
FMUL f17, f31, f0
|
||||||
FMUL f18, f31, f5
|
FMUL f18, f31, f5
|
||||||
|
@ -2401,10 +2376,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA2 f3, f18, f21, f3
|
FMA2 f3, f18, f21, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -2416,10 +2391,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f23, f1
|
FMA2 f1, f16, f23, f1
|
||||||
FMA2 f3, f18, f23, f3
|
FMA2 f3, f18, f23, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f22, f9
|
FMA4 f1, f17, f22, f1
|
||||||
FMA4 f11, f19, f22, f11
|
FMA4 f3, f19, f22, f3
|
||||||
FMA3 f8, f17, f23, f8
|
FMA3 f0, f17, f23, f0
|
||||||
FMA3 f10, f19, f23, f10
|
FMA3 f2, f19, f23, f2
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -2436,10 +2411,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA2 f3, f18, f21, f3
|
FMA2 f3, f18, f21, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f16, 12 * SIZE(AO)
|
LFD f16, 12 * SIZE(AO)
|
||||||
LFD f17, 13 * SIZE(AO)
|
LFD f17, 13 * SIZE(AO)
|
||||||
|
@ -2451,10 +2426,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f23, f1
|
FMA2 f1, f16, f23, f1
|
||||||
FMA2 f3, f18, f23, f3
|
FMA2 f3, f18, f23, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f22, f9
|
FMA4 f1, f17, f22, f1
|
||||||
FMA4 f11, f19, f22, f11
|
FMA4 f3, f19, f22, f3
|
||||||
FMA3 f8, f17, f23, f8
|
FMA3 f0, f17, f23, f0
|
||||||
FMA3 f10, f19, f23, f10
|
FMA3 f2, f19, f23, f2
|
||||||
|
|
||||||
LFD f16, 16 * SIZE(AO)
|
LFD f16, 16 * SIZE(AO)
|
||||||
LFD f17, 17 * SIZE(AO)
|
LFD f17, 17 * SIZE(AO)
|
||||||
|
@ -2471,10 +2446,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA2 f3, f18, f21, f3
|
FMA2 f3, f18, f21, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f16, 20 * SIZE(AO)
|
LFD f16, 20 * SIZE(AO)
|
||||||
LFD f17, 21 * SIZE(AO)
|
LFD f17, 21 * SIZE(AO)
|
||||||
|
@ -2486,10 +2461,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f23, f1
|
FMA2 f1, f16, f23, f1
|
||||||
FMA2 f3, f18, f23, f3
|
FMA2 f3, f18, f23, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f22, f9
|
FMA4 f1, f17, f22, f1
|
||||||
FMA4 f11, f19, f22, f11
|
FMA4 f3, f19, f22, f3
|
||||||
FMA3 f8, f17, f23, f8
|
FMA3 f0, f17, f23, f0
|
||||||
FMA3 f10, f19, f23, f10
|
FMA3 f2, f19, f23, f2
|
||||||
|
|
||||||
LFD f16, 24 * SIZE(AO)
|
LFD f16, 24 * SIZE(AO)
|
||||||
LFD f17, 25 * SIZE(AO)
|
LFD f17, 25 * SIZE(AO)
|
||||||
|
@ -2506,10 +2481,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA2 f3, f18, f21, f3
|
FMA2 f3, f18, f21, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f16, 28 * SIZE(AO)
|
LFD f16, 28 * SIZE(AO)
|
||||||
LFD f17, 29 * SIZE(AO)
|
LFD f17, 29 * SIZE(AO)
|
||||||
|
@ -2521,10 +2496,10 @@ LL(52):
|
||||||
FMA2 f1, f16, f23, f1
|
FMA2 f1, f16, f23, f1
|
||||||
FMA2 f3, f18, f23, f3
|
FMA2 f3, f18, f23, f3
|
||||||
|
|
||||||
FMA4 f9, f17, f22, f9
|
FMA4 f1, f17, f22, f1
|
||||||
FMA4 f11, f19, f22, f11
|
FMA4 f3, f19, f22, f3
|
||||||
FMA3 f8, f17, f23, f8
|
FMA3 f0, f17, f23, f0
|
||||||
FMA3 f10, f19, f23, f10
|
FMA3 f2, f19, f23, f2
|
||||||
|
|
||||||
LFD f16, 32 * SIZE(AO)
|
LFD f16, 32 * SIZE(AO)
|
||||||
LFD f17, 33 * SIZE(AO)
|
LFD f17, 33 * SIZE(AO)
|
||||||
|
@ -2573,10 +2548,10 @@ LL(56):
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f18, 6 * SIZE(AO)
|
LFD f18, 6 * SIZE(AO)
|
||||||
|
|
||||||
FMA4 f9, f17, f20, f9
|
FMA4 f1, f17, f20, f1
|
||||||
FMA4 f11, f19, f20, f11
|
FMA4 f3, f19, f20, f3
|
||||||
FMA3 f8, f17, f21, f8
|
FMA3 f0, f17, f21, f0
|
||||||
FMA3 f10, f19, f21, f10
|
FMA3 f2, f19, f21, f2
|
||||||
|
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
LFD f19, 7 * SIZE(AO)
|
LFD f19, 7 * SIZE(AO)
|
||||||
|
@ -2595,27 +2570,22 @@ LL(58):
|
||||||
LFD f18, 2 * SIZE(CO1)
|
LFD f18, 2 * SIZE(CO1)
|
||||||
LFD f19, 3 * SIZE(CO1)
|
LFD f19, 3 * SIZE(CO1)
|
||||||
|
|
||||||
FADD f0, f0, f8
|
fmr f8, f0
|
||||||
FADD f1, f1, f9
|
fmr f9, f1
|
||||||
FADD f2, f2, f10
|
fmr f10, f2
|
||||||
FADD f3, f3, f11
|
fmr f11, f3
|
||||||
|
|
||||||
FNMSUB f24, f31, f1, f16
|
FMADD f24, f30, f0, f16
|
||||||
FMADD f25, f31, f0, f17
|
FMADD f25, f30, f1, f17
|
||||||
FNMSUB f26, f31, f3, f18
|
FMADD f26, f30, f2, f18
|
||||||
FMADD f27, f31, f2, f19
|
FMADD f27, f30, f3, f19
|
||||||
|
|
||||||
FMADD f0, f30, f0, f24
|
FNMSUB f0, f31, f9, f24
|
||||||
FMADD f1, f30, f1, f25
|
FMADD f1, f31, f8, f25
|
||||||
FMADD f2, f30, f2, f26
|
FNMSUB f2, f31, f11, f26
|
||||||
FMADD f3, f30, f3, f27
|
FMADD f3, f31, f10, f27
|
||||||
|
|
||||||
#else
|
#else
|
||||||
FADD f0, f0, f8
|
|
||||||
FADD f1, f1, f9
|
|
||||||
FADD f2, f2, f10
|
|
||||||
FADD f3, f3, f11
|
|
||||||
|
|
||||||
FMUL f16, f31, f1
|
FMUL f16, f31, f1
|
||||||
FMUL f17, f31, f0
|
FMUL f17, f31, f0
|
||||||
FMUL f18, f31, f3
|
FMUL f18, f31, f3
|
||||||
|
@ -2735,9 +2705,9 @@ LL(60):
|
||||||
|
|
||||||
LL(62):
|
LL(62):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
LFD f16, 4 * SIZE(AO)
|
LFD f16, 4 * SIZE(AO)
|
||||||
LFD f17, 5 * SIZE(AO)
|
LFD f17, 5 * SIZE(AO)
|
||||||
|
@ -2745,9 +2715,9 @@ LL(62):
|
||||||
LFD f21, 5 * SIZE(BO)
|
LFD f21, 5 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f18, f22, f0
|
FMA1 f0, f18, f22, f0
|
||||||
FMA4 f3, f19, f22, f3
|
|
||||||
FMA2 f1, f18, f23, f1
|
FMA2 f1, f18, f23, f1
|
||||||
FMA3 f2, f19, f23, f2
|
FMA4 f1, f19, f22, f1
|
||||||
|
FMA3 f0, f19, f23, f0
|
||||||
|
|
||||||
LFD f18, 6 * SIZE(AO)
|
LFD f18, 6 * SIZE(AO)
|
||||||
LFD f19, 7 * SIZE(AO)
|
LFD f19, 7 * SIZE(AO)
|
||||||
|
@ -2755,9 +2725,9 @@ LL(62):
|
||||||
LFD f23, 7 * SIZE(BO)
|
LFD f23, 7 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
|
|
||||||
LFD f16, 8 * SIZE(AO)
|
LFD f16, 8 * SIZE(AO)
|
||||||
LFD f17, 9 * SIZE(AO)
|
LFD f17, 9 * SIZE(AO)
|
||||||
|
@ -2765,9 +2735,9 @@ LL(62):
|
||||||
LFD f21, 9 * SIZE(BO)
|
LFD f21, 9 * SIZE(BO)
|
||||||
|
|
||||||
FMA1 f0, f18, f22, f0
|
FMA1 f0, f18, f22, f0
|
||||||
FMA4 f3, f19, f22, f3
|
|
||||||
FMA2 f1, f18, f23, f1
|
FMA2 f1, f18, f23, f1
|
||||||
FMA3 f2, f19, f23, f2
|
FMA4 f1, f19, f22, f1
|
||||||
|
FMA3 f0, f19, f23, f0
|
||||||
|
|
||||||
LFD f18, 10 * SIZE(AO)
|
LFD f18, 10 * SIZE(AO)
|
||||||
LFD f19, 11 * SIZE(AO)
|
LFD f19, 11 * SIZE(AO)
|
||||||
|
@ -2803,11 +2773,11 @@ LL(65):
|
||||||
|
|
||||||
LL(66):
|
LL(66):
|
||||||
FMA1 f0, f16, f20, f0
|
FMA1 f0, f16, f20, f0
|
||||||
FMA4 f3, f17, f20, f3
|
|
||||||
LFD f20, 2 * SIZE(BO)
|
|
||||||
FMA2 f1, f16, f21, f1
|
FMA2 f1, f16, f21, f1
|
||||||
LFD f16, 2 * SIZE(AO)
|
LFD f16, 2 * SIZE(AO)
|
||||||
FMA3 f2, f17, f21, f2
|
FMA4 f1, f17, f20, f1
|
||||||
|
LFD f20, 2 * SIZE(BO)
|
||||||
|
FMA3 f0, f17, f21, f0
|
||||||
LFD f17, 3 * SIZE(AO)
|
LFD f17, 3 * SIZE(AO)
|
||||||
|
|
||||||
LFD f21, 3 * SIZE(BO)
|
LFD f21, 3 * SIZE(BO)
|
||||||
|
@ -2821,20 +2791,17 @@ LL(68):
|
||||||
LFD f16, 0 * SIZE(CO1)
|
LFD f16, 0 * SIZE(CO1)
|
||||||
LFD f17, 1 * SIZE(CO1)
|
LFD f17, 1 * SIZE(CO1)
|
||||||
|
|
||||||
FADD f0, f0, f2
|
fmr f2, f0
|
||||||
FADD f1, f1, f3
|
fmr f3, f1
|
||||||
|
|
||||||
FNMSUB f24, f31, f1, f16
|
FMADD f24, f30, f0, f16
|
||||||
FMADD f25, f31, f0, f17
|
FMADD f25, f30, f1, f17
|
||||||
|
|
||||||
FMADD f0, f30, f0, f24
|
FNMSUB f0, f31, f3, f24
|
||||||
FMADD f1, f30, f1, f25
|
FMADD f1, f31, f2, f25
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|
||||||
FADD f0, f0, f2
|
|
||||||
FADD f1, f1, f3
|
|
||||||
|
|
||||||
FMUL f16, f31, f1
|
FMUL f16, f31, f1
|
||||||
FMUL f17, f31, f0
|
FMUL f17, f31, f0
|
||||||
|
|
||||||
|
|
|
@ -99,26 +99,26 @@ ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
|
||||||
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||||
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||||
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
SGEMMONCOPYOBJ = sgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
|
||||||
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
|
||||||
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
|
||||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||||
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
CGEMMONCOPYOBJ = cgemm_oncopy.o
|
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
CGEMMOTCOPYOBJ = cgemm_otcopy.o
|
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
|
||||||
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
|
||||||
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
|
||||||
ZGEMMONCOPYOBJ = zgemm_oncopy.o
|
ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
|
||||||
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
|
ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
|
||||||
|
|
||||||
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
|
||||||
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
|
||||||
|
|
|
@ -1244,6 +1244,36 @@ static void init_parameter(void) {
|
||||||
}
|
}
|
||||||
#else //ZARCH
|
#else //ZARCH
|
||||||
|
|
||||||
|
#if (ARCH_RISCV64)
|
||||||
|
static void init_parameter(void) {
|
||||||
|
|
||||||
|
#ifdef BUILD_BFLOAT16
|
||||||
|
TABLE_NAME.sbgemm_p = SBGEMM_DEFAULT_P;
|
||||||
|
#endif
|
||||||
|
TABLE_NAME.sgemm_p = SGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.dgemm_p = DGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.cgemm_p = CGEMM_DEFAULT_P;
|
||||||
|
TABLE_NAME.zgemm_p = ZGEMM_DEFAULT_P;
|
||||||
|
|
||||||
|
#ifdef BUILD_BFLOAT16
|
||||||
|
TABLE_NAME.sbgemm_r = SBGEMM_DEFAULT_R;
|
||||||
|
#endif
|
||||||
|
TABLE_NAME.sgemm_r = SGEMM_DEFAULT_R;
|
||||||
|
TABLE_NAME.dgemm_r = DGEMM_DEFAULT_R;
|
||||||
|
TABLE_NAME.cgemm_r = CGEMM_DEFAULT_R;
|
||||||
|
TABLE_NAME.zgemm_r = ZGEMM_DEFAULT_R;
|
||||||
|
|
||||||
|
|
||||||
|
#ifdef BUILD_BFLOAT16
|
||||||
|
TABLE_NAME.sbgemm_q = SBGEMM_DEFAULT_Q;
|
||||||
|
#endif
|
||||||
|
TABLE_NAME.sgemm_q = SGEMM_DEFAULT_Q;
|
||||||
|
TABLE_NAME.dgemm_q = DGEMM_DEFAULT_Q;
|
||||||
|
TABLE_NAME.cgemm_q = CGEMM_DEFAULT_Q;
|
||||||
|
TABLE_NAME.zgemm_q = ZGEMM_DEFAULT_Q;
|
||||||
|
}
|
||||||
|
#else //RISCV64
|
||||||
|
|
||||||
#ifdef ARCH_X86
|
#ifdef ARCH_X86
|
||||||
static int get_l2_size_old(void){
|
static int get_l2_size_old(void){
|
||||||
int i, eax, ebx, ecx, edx, cpuid_level;
|
int i, eax, ebx, ecx, edx, cpuid_level;
|
||||||
|
@ -2046,6 +2076,7 @@ static void init_parameter(void) {
|
||||||
|
|
||||||
|
|
||||||
}
|
}
|
||||||
|
#endif //RISCV64
|
||||||
#endif //POWER
|
#endif //POWER
|
||||||
#endif //ZARCH
|
#endif //ZARCH
|
||||||
#endif //(ARCH_LOONGARCH64)
|
#endif //(ARCH_LOONGARCH64)
|
||||||
|
|
|
@ -1,6 +1,11 @@
|
||||||
TOPDIR = ../../..
|
TOPDIR = ../../..
|
||||||
include ../../../Makefile.system
|
include ../../../Makefile.system
|
||||||
|
|
||||||
|
ifeq ($(DYNAMIC_ARCH), 1)
|
||||||
|
LASWP = ../generic/laswp_k_4.c
|
||||||
|
ZLASWP = ../generic/zlaswp_k_4.c
|
||||||
|
endif
|
||||||
|
|
||||||
ifndef LASWP
|
ifndef LASWP
|
||||||
LASWP = ../generic/laswp_k.c
|
LASWP = ../generic/laswp_k.c
|
||||||
endif
|
endif
|
||||||
|
|
Loading…
Reference in New Issue