Add support for LOONGARCH64

This commit is contained in:
gxw 2021-07-26 15:44:54 +08:00
parent 5a2fe5bfb9
commit af0a69f355
51 changed files with 24189 additions and 27 deletions

3
Makefile.loongarch64 Normal file
View File

@ -0,0 +1,3 @@
# Architecture make fragment for loongarch64.
# Intentionally empty for now: no extra compiler/linker options are needed
# whether or not a 64-bit binary (BINARY64) is requested; the branches are
# kept as placeholders for future 32/64-bit specific flags.
ifdef BINARY64
else
endif

View File

@ -780,6 +780,11 @@ NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
# loongarch64 is handled as a 64-bit-only target: disable 32-bit binary
# mode selection and mark the binary width as already decided.
ifeq ($(ARCH), loongarch64)
NO_BINARY_MODE = 1
BINARY_DEFINED = 1
endif
#
# C Compiler dependent settings
@ -850,6 +855,13 @@ ifeq ($(OSNAME), AIX)
BINARY_DEFINED = 1
endif
ifeq ($(ARCH), loongarch64)
# Loongson 3A5000 family: target the LoongArch64 ISA with the LP64 ABI.
# Fixed typo: the core was spelled "LOONGSONG3R5" (extra G), which never
# matched the LOONGSON3R5 value emitted by getarch, so these flags were
# silently never applied.
ifeq ($(CORE), LOONGSON3R5)
CCOMMON_OPT += -march=loongarch64 -mabi=lp64
FCOMMON_OPT += -march=loongarch64 -mabi=lp64
endif
endif
endif
ifndef BINARY_DEFINED

View File

@ -110,3 +110,5 @@ Z14
RISCV64_GENERIC
C910V
11.LOONGARCH64:
LOONGSON3R5

View File

@ -94,6 +94,7 @@ $architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = riscv64 if ($data =~ /ARCH_RISCV64/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$defined = 0;
@ -143,6 +144,11 @@ if ($architecture eq "riscv64") {
$binary = 64;
}
if ($architecture eq "loongarch64") {
$defined = 1;
$binary = 64;
}
if ($compiler eq "PGI") {
$compiler_name .= " -tp p7" if ($binary eq "32");
$compiler_name .= " -tp p7-64" if ($binary eq "64");
@ -226,6 +232,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/);
$architecture = arm if ($data =~ /ARCH_ARM/);
$architecture = arm64 if ($data =~ /ARCH_ARM64/);
$architecture = zarch if ($data =~ /ARCH_ZARCH/);
$architecture = loongarch64 if ($data =~ /ARCH_LOONGARCH64/);
$binformat = bin32;
$binformat = bin64 if ($data =~ /BINARY_64/);

View File

@ -470,6 +470,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
#include "common_zarch.h"
#endif
#ifdef ARCH_LOONGARCH64
#include "common_loongarch64.h"
#endif
#ifndef ASSEMBLER
#ifdef OS_WINDOWSSTORE
typedef char env_var_t[MAX_PATH];

199
common_loongarch64.h Normal file
View File

@ -0,0 +1,199 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#ifndef COMMON_LOONGARCH64
#define COMMON_LOONGARCH64

/* Memory barriers: full __sync_synchronize() for read/write/full ordering.
   LoongArch has finer-grained dbar hints, but a full barrier is always safe. */
#define MB __sync_synchronize()
#define WMB __sync_synchronize()
#define RMB __sync_synchronize()

#define INLINE inline

#ifndef ASSEMBLER

/* Plain integer division; used by the blocking code. */
static inline int blas_quickdivide(blasint x, blasint y){
return x / y;
}

/* Fetch the imaginary part of a complex result from $f2 (the second
   floating-point return register in the LP64 calling convention). */
#ifdef DOUBLE
#define GET_IMAGE(res) __asm__ __volatile__("fmov.d %0, $f2" : "=f"(res) : : "memory")
#else
#define GET_IMAGE(res) __asm__ __volatile__("fmov.s %0, $f2" : "=f"(res) : : "memory")
#endif

#define GET_IMAGE_CANCEL

#else

/* Assembler-side mnemonic aliases: one kernel source serves both single
   and double precision by expanding these per DOUBLE. */
#ifdef DOUBLE
#define LD fld.d
#define ST fst.d
#define MADD fmadd.d
#define NMADD fnmadd.d
#define MSUB fmsub.d
#define NMSUB fnmsub.d
#define ADD fadd.d
#define SUB fsub.d
#define MUL fmul.d
#define MOV fmov.d
#define CMOVT fsel
#define MTC movgr2fr.d
#define FABS fabs.d
#define CMPEQ fcmp.ceq.d
#define CMPLE fcmp.cle.d
#define CMPLT fcmp.clt.d
#define NEG fneg.d
#else
#define LD fld.s
#define ST fst.s
#define MADD fmadd.s
#define NMADD fnmadd.s
#define MSUB fmsub.s
#define NMSUB fnmsub.s
#define ADD fadd.s
#define SUB fsub.s
#define MUL fmul.s
#define MOV fmov.s
#define CMOVT fsel
#define MTC movgr2fr.w
#define FABS fabs.s
#define CMPEQ fcmp.ceq.s
#define CMPLE fcmp.cle.s
#define CMPLT fcmp.clt.s
#define NEG fneg.s
#endif /* defined(DOUBLE) */

/* Integer load/store widths depending on pointer width and blasint size. */
#if defined(__64BIT__) && defined(USE64BITINT)
#define LDINT ld.d
#define LDARG ld.d
#define SDARG st.d
#elif defined(__64BIT__) && !defined(USE64BITINT)
#define LDINT ld.w
#define LDARG ld.d
#define SDARG st.d
#else
#define LDINT ld.w
#define LDARG ld.w
#define SDARG st.w
#endif

#ifndef F_INTERFACE
#define REALNAME ASMNAME
#else
#define REALNAME ASMFNAME
#endif /* defined(F_INTERFACE) */

#if defined(ASSEMBLER) && !defined(NEEDPARAM)

/* Function prologue for assembly kernels.
   FIX: the label line must NOT end with a line-continuation -- a trailing
   ";\" after "REALNAME:" splices the following "#if" line into the macro
   definition and breaks preprocessing. */
#define PROLOGUE \
.text ;\
.align 5 ;\
.globl REALNAME ;\
.type REALNAME, @function ;\
REALNAME:

#if defined(__linux__) && defined(__ELF__)
#define GNUSTACK .section .note.GNU-stack,"",@progbits
#else
#define GNUSTACK
#endif /* defined(__linux__) && defined(__ELF__) */

#define EPILOGUE \
.end REALNAME ;\
GNUSTACK

#define PROFCODE

/* MOVT(dst, src, cc): conditional move -- dst = src when flag cc is set. */
#define MOVT(dst, src, cc) \
bceqz cc, 1f; \
add.d dst, src, $r0; \
1:

#endif /* defined(ASSEMBLER) && !defined(NEEDPARAM) */
#endif /* defined(ASSEMBLER) */

#define SEEK_ADDRESS

#define BUFFER_SIZE ( 32 << 20)
/* LoongArch uses 16 KB pages.
   FIX: was (16UL << 1) == 32 bytes, which would make page rounding in the
   memory allocator nonsense; (16UL << 10) matches FIXED_PAGESIZE below. */
#define PAGESIZE (16UL << 10)
#define FIXED_PAGESIZE (16UL << 10)
#define HUGE_PAGESIZE ( 2 << 20)

#define BASE_ADDRESS (START_ADDRESS - BUFFER_SIZE * MAX_CPU_NUMBER)

#ifndef MAP_ANONYMOUS
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif

View File

@ -2490,7 +2490,8 @@
#endif
#ifndef ASSEMBLER
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)
#if defined(ARCH_X86) || defined(ARCH_X86_64) || defined(ARCH_IA64) || defined(ARCH_MIPS64) || defined(ARCH_ARM64)\
|| defined(ARCH_LOONGARCH64)
extern BLASLONG gemm_offset_a;
extern BLASLONG gemm_offset_b;
extern BLASLONG sbgemm_p;

110
cpuid_loongarch64.c Normal file
View File

@ -0,0 +1,110 @@
/*****************************************************************************
Copyright (c) 2011-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written
permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**********************************************************************************/
#include <stdint.h>
#include <stdio.h>  /* FIX: was missing -- printf() is used throughout this
                       file and relied on an implicit declaration. */

/* CPU identifiers returned by detect(); also indices into cpuname[]. */
#define CPU_UNKNOWN 0
#define CPU_LOONGSON3R5 1

/* CPUCFG word 2 holds ISA feature flags; bit 7 indicates LASX (256-bit
   SIMD), which identifies Loongson 3A5000 (3R5) class cores. */
#define LOONGARCH_CFG2 0x02
#define LOONGARCH_LASX 1<<7

static char *cpuname[] = {
"UNKNOWN",
"LOONGSON3R5"
};

/* Probe the CPU with the cpucfg instruction and classify it. */
int detect(void) {
uint32_t reg = 0;
__asm__ volatile (
"cpucfg %0, %1 \n\t"
: "+&r"(reg)
: "r"(LOONGARCH_CFG2)
);
if (reg & LOONGARCH_LASX)
return CPU_LOONGSON3R5;
else
return CPU_UNKNOWN;
}

char *get_corename(void) {
/* detect() returns 0 or 1, always a valid cpuname[] index. */
return cpuname[detect()];
}

void get_architecture(void) {
printf("LOONGARCH64");
}

void get_subarchitecture(void) {
if (detect() == CPU_LOONGSON3R5) {
printf("LOONGSON3R5");
} else {
printf("UNKNOWN");
}
}

void get_subdirname(void) {
printf("loongarch64");
}

/* Emit cache/TLB parameters for config.h.
   NOTE(review): both branches are currently identical -- the UNKNOWN
   fallback reuses the 3R5 numbers until more cores are supported. */
void get_cpuconfig(void) {
if (detect() == CPU_LOONGSON3R5) {
printf("#define LOONGSON3R5\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
} else {
printf("#define LOONGSON3R5\n");
printf("#define L1_DATA_SIZE 65536\n");
printf("#define L1_DATA_LINESIZE 64\n");
printf("#define L2_SIZE 1048576\n");
printf("#define L2_LINESIZE 64\n");
printf("#define DTB_DEFAULT_ENTRIES 64\n");
printf("#define DTB_SIZE 4096\n");
printf("#define L2_ASSOCIATIVE 16\n");
}
}

void get_libname(void){
if (detect() == CPU_LOONGSON3R5) {
printf("loongson3r5\n");
} else {
printf("loongarch64\n");
}
}

View File

@ -157,6 +157,10 @@ ARCH_ARM64
ARCH_RISCV64
#endif
#ifdef __loongarch64
ARCH_LOONGARCH64
#endif
#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L)
HAVE_C11
#endif

View File

@ -142,6 +142,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
/* #define FORCE_SICORTEX */
/* #define FORCE_LOONGSON3R3 */
/* #define FORCE_LOONGSON3R4 */
/* #define FORCE_LOONGSON3R5 */
/* #define FORCE_I6400 */
/* #define FORCE_P6600 */
/* #define FORCE_P5600 */
@ -842,6 +843,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
#endif
#ifdef FORCE_LOONGSON3R5
#define FORCE
#define ARCHITECTURE "LOONGARCH"
#define SUBARCHITECTURE "LOONGSON3R5"
#define SUBDIRNAME "loongarch64"
#define ARCHCONFIG "-DLOONGSON3R5 " \
"-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=64 " \
"-DL2_SIZE=1048576 -DL2_LINESIZE=64 " \
"-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=16 "
#define LIBNAME "loongson3r5"
#define CORENAME "LOONGSON3R5"
#else
#endif
#ifdef FORCE_I6400
#define FORCE
#define ARCHITECTURE "MIPS"
@ -1388,6 +1403,11 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define OPENBLAS_SUPPORTED
#endif
#ifdef __loongarch64
#include "cpuid_loongarch64.c"
#define OPENBLAS_SUPPORTED
#endif
#ifdef __riscv
#include "cpuid_riscv64.c"
#define OPENBLAS_SUPPORTED
@ -1463,7 +1483,7 @@ int main(int argc, char *argv[]){
#ifdef FORCE
printf("CORE=%s\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("CORE=%s\n", get_corename());
#endif
#endif
@ -1611,7 +1631,7 @@ printf("ELF_VERSION=2\n");
#ifdef FORCE
printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
#else
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc)
#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__)
printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
#endif
#endif

236
kernel/loongarch64/KERNEL Normal file
View File

@ -0,0 +1,236 @@
# Default kernel selections for the loongarch64 architecture.
# Every assignment is guarded by ifndef so a core-specific kernel file
# (e.g. KERNEL.LOONGSON3R5) can override any entry; anything left unset
# falls back to the portable C sources under ../arm and ../generic, or
# to the loongarch64 assembly sources in this directory.

# ---- Level-1 BLAS: axpy / rot / swap / sum (portable C fallbacks) ----
ifndef SAXPYKERNEL
SAXPYKERNEL = ../arm/axpy.c
endif
ifndef DAXPYKERNEL
DAXPYKERNEL = ../arm/axpy.c
endif
ifndef CAXPYKERNEL
CAXPYKERNEL = ../arm/zaxpy.c
endif
ifndef ZAXPYKERNEL
ZAXPYKERNEL = ../arm/zaxpy.c
endif
ifndef SROTKERNEL
SROTKERNEL = ../arm/rot.c
endif
ifndef DROTKERNEL
DROTKERNEL = ../arm/rot.c
endif
ifndef CROTKERNEL
CROTKERNEL = ../arm/zrot.c
endif
ifndef ZROTKERNEL
ZROTKERNEL = ../arm/zrot.c
endif
ifndef CSWAPKERNEL
CSWAPKERNEL = ../arm/zswap.c
endif
ifndef ZSWAPKERNEL
ZSWAPKERNEL = ../arm/zswap.c
endif
ifndef SSUMKERNEL
SSUMKERNEL = ../arm/sum.c
endif
ifndef DSUMKERNEL
DSUMKERNEL = ../arm/sum.c
endif
ifndef CSUMKERNEL
CSUMKERNEL = ../arm/zsum.c
endif
ifndef ZSUMKERNEL
ZSUMKERNEL = ../arm/zsum.c
endif
# ---- index-of-extreme (portable C) ----
ifndef ISMAXKERNEL
ISMAXKERNEL = ../arm/imax.c
endif
ifndef IDMAXKERNEL
IDMAXKERNEL = ../arm/imax.c
endif
ifndef ISMINKERNEL
ISMINKERNEL = ../arm/imin.c
endif
ifndef IDMINKERNEL
IDMINKERNEL = ../arm/imin.c
endif
# ---- nrm2: loongarch64 assembly implementations in this directory ----
ifndef SNRM2KERNEL
SNRM2KERNEL = snrm2.S
endif
ifndef DNRM2KERNEL
DNRM2KERNEL = dnrm2.S
endif
ifndef CNRM2KERNEL
CNRM2KERNEL = cnrm2.S
endif
ifndef ZNRM2KERNEL
ZNRM2KERNEL = znrm2.S
endif
# ---- scalar helpers ----
ifndef SCABS_KERNEL
SCABS_KERNEL = ../generic/cabs.c
endif
ifndef DCABS_KERNEL
DCABS_KERNEL = ../generic/cabs.c
endif
ifndef QCABS_KERNEL
QCABS_KERNEL = ../generic/cabs.c
endif
ifndef LSAME_KERNEL
LSAME_KERNEL = ../generic/lsame.c
endif
# ---- GEMM: assembly compute kernels with generic pack/copy routines ----
ifndef SGEMMKERNEL
SGEMMKERNEL = gemm_kernel.S
SGEMMINCOPY = ../generic/gemm_ncopy_2.c
SGEMMITCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPY = ../generic/gemm_ncopy_8.c
SGEMMOTCOPY = ../generic/gemm_tcopy_8.c
SGEMMINCOPYOBJ = sgemm_incopy.o
SGEMMITCOPYOBJ = sgemm_itcopy.o
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
endif
ifndef DGEMMKERNEL
DGEMMKERNEL = gemm_kernel.S
DGEMMINCOPY = ../generic/gemm_ncopy_2.c
DGEMMITCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPY = ../generic/gemm_ncopy_8.c
DGEMMOTCOPY = ../generic/gemm_tcopy_8.c
DGEMMINCOPYOBJ = dgemm_incopy.o
DGEMMITCOPYOBJ = dgemm_itcopy.o
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
endif
ifndef CGEMMKERNEL
CGEMMKERNEL = zgemm_kernel.S
CGEMMINCOPY = ../generic/zgemm_ncopy_1.c
CGEMMITCOPY = ../generic/zgemm_tcopy_1.c
CGEMMONCOPY = ../generic/zgemm_ncopy_4.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
CGEMMINCOPYOBJ = cgemm_incopy.o
CGEMMITCOPYOBJ = cgemm_itcopy.o
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
endif
ifndef ZGEMMKERNEL
ZGEMMKERNEL = zgemm_kernel.S
ZGEMMINCOPY = ../generic/zgemm_ncopy_1.c
ZGEMMITCOPY = ../generic/zgemm_tcopy_1.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_4.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_4.c
ZGEMMINCOPYOBJ = zgemm_incopy.o
ZGEMMITCOPYOBJ = zgemm_itcopy.o
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o
endif
# ---- GEMM beta scaling (generic C) ----
ifndef SGEMM_BETA
SGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef DGEMM_BETA
DGEMM_BETA = ../generic/gemm_beta.c
endif
ifndef CGEMM_BETA
CGEMM_BETA = ../generic/zgemm_beta.c
endif
ifndef ZGEMM_BETA
ZGEMM_BETA = ../generic/zgemm_beta.c
endif
# ---- TRSM kernels (assembly); RN variants reuse the LT kernels ----
ifndef STRSMKERNEL_LN
STRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef STRSMKERNEL_LT
STRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RN
STRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef STRSMKERNEL_RT
STRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef DTRSMKERNEL_LN
DTRSMKERNEL_LN = trsm_kernel_LN.S
endif
ifndef DTRSMKERNEL_LT
DTRSMKERNEL_LT = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RN
DTRSMKERNEL_RN = trsm_kernel_LT.S
endif
ifndef DTRSMKERNEL_RT
DTRSMKERNEL_RT = trsm_kernel_RT.S
endif
ifndef CTRSMKERNEL_LN
CTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_LT
CTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RN
CTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef CTRSMKERNEL_RT
CTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
ifndef ZTRSMKERNEL_LN
ZTRSMKERNEL_LN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_LT
ZTRSMKERNEL_LT = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RN
ZTRSMKERNEL_RN = ztrsm_kernel_LT.S
endif
ifndef ZTRSMKERNEL_RT
ZTRSMKERNEL_RT = ztrsm_kernel_RT.S
endif
# ---- GEMM3M kernels (assembly) ----
ifndef CGEMM3MKERNEL
CGEMM3MKERNEL = zgemm3m_kernel.S
endif
ifndef ZGEMM3MKERNEL
ZGEMM3MKERNEL = zgemm3m_kernel.S
endif

View File

@ -0,0 +1 @@
#TODO: Add loongarch64 SIMD optimizations

View File

@ -0,0 +1,167 @@
# Kernel selections for the LOONGSON3R5 (Loongson 3A5000) core.
# Everything currently points at portable C implementations; these are
# placeholders until LoongArch SIMD (LSX/LASX) optimized kernels exist.

# ---- GEMM beta scaling ----
SGEMM_BETA = ../generic/gemm_beta.c
DGEMM_BETA = ../generic/gemm_beta.c
CGEMM_BETA = ../generic/zgemm_beta.c
ZGEMM_BETA = ../generic/zgemm_beta.c

# ---- TRMM / GEMM compute kernels with 2x2 blocking, plus pack routines ----
STRMMKERNEL = ../generic/trmmkernel_2x2.c
DTRMMKERNEL = ../generic/trmmkernel_2x2.c
CTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
ZTRMMKERNEL = ../generic/ztrmmkernel_2x2.c
SGEMMKERNEL = ../generic/gemmkernel_2x2.c
SGEMMONCOPY = ../generic/gemm_ncopy_2.c
SGEMMOTCOPY = ../generic/gemm_tcopy_2.c
SGEMMONCOPYOBJ = sgemm_oncopy.o
SGEMMOTCOPYOBJ = sgemm_otcopy.o
DGEMMKERNEL = ../generic/gemmkernel_2x2.c
DGEMMONCOPY = ../generic/gemm_ncopy_2.c
DGEMMOTCOPY = ../generic/gemm_tcopy_2.c
DGEMMONCOPYOBJ = dgemm_oncopy.o
DGEMMOTCOPYOBJ = dgemm_otcopy.o
CGEMMKERNEL = ../generic/zgemmkernel_2x2.c
CGEMMONCOPY = ../generic/zgemm_ncopy_2.c
CGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
CGEMMONCOPYOBJ = cgemm_oncopy.o
CGEMMOTCOPYOBJ = cgemm_otcopy.o
ZGEMMKERNEL = ../generic/zgemmkernel_2x2.c
ZGEMMONCOPY = ../generic/zgemm_ncopy_2.c
ZGEMMOTCOPY = ../generic/zgemm_tcopy_2.c
ZGEMMONCOPYOBJ = zgemm_oncopy.o
ZGEMMOTCOPYOBJ = zgemm_otcopy.o

# ---- TRSM kernels ----
STRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
STRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
STRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
STRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
ZTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
#Pure C for other kernels
SAMAXKERNEL = ../arm/amax.c
DAMAXKERNEL = ../arm/amax.c
CAMAXKERNEL = ../arm/zamax.c
ZAMAXKERNEL = ../arm/zamax.c
SAMINKERNEL = ../arm/amin.c
DAMINKERNEL = ../arm/amin.c
CAMINKERNEL = ../arm/zamin.c
ZAMINKERNEL = ../arm/zamin.c
SMAXKERNEL = ../arm/max.c
DMAXKERNEL = ../arm/max.c
SMINKERNEL = ../arm/min.c
DMINKERNEL = ../arm/min.c
ISAMAXKERNEL = ../arm/iamax.c
IDAMAXKERNEL = ../arm/iamax.c
ICAMAXKERNEL = ../arm/izamax.c
IZAMAXKERNEL = ../arm/izamax.c
ISAMINKERNEL = ../arm/iamin.c
IDAMINKERNEL = ../arm/iamin.c
ICAMINKERNEL = ../arm/izamin.c
IZAMINKERNEL = ../arm/izamin.c
ISMAXKERNEL = ../arm/imax.c
IDMAXKERNEL = ../arm/imax.c
ISMINKERNEL = ../arm/imin.c
IDMINKERNEL = ../arm/imin.c
SASUMKERNEL = ../arm/asum.c
DASUMKERNEL = ../arm/asum.c
CASUMKERNEL = ../arm/zasum.c
ZASUMKERNEL = ../arm/zasum.c
SSUMKERNEL = ../arm/sum.c
DSUMKERNEL = ../arm/sum.c
CSUMKERNEL = ../arm/zsum.c
ZSUMKERNEL = ../arm/zsum.c
SAXPYKERNEL = ../arm/axpy.c
DAXPYKERNEL = ../arm/axpy.c
CAXPYKERNEL = ../arm/zaxpy.c
ZAXPYKERNEL = ../arm/zaxpy.c
SCOPYKERNEL = ../arm/copy.c
DCOPYKERNEL = ../arm/copy.c
CCOPYKERNEL = ../arm/zcopy.c
ZCOPYKERNEL = ../arm/zcopy.c
# NOTE(review): SDOT intentionally uses ../generic/dot.c (unlike DDOT's
# ../arm/dot.c) -- presumably for DSDOT support; confirm before changing.
SDOTKERNEL = ../generic/dot.c
DDOTKERNEL = ../arm/dot.c
CDOTKERNEL = ../arm/zdot.c
ZDOTKERNEL = ../arm/zdot.c
SNRM2KERNEL = ../arm/nrm2.c
DNRM2KERNEL = ../arm/nrm2.c
CNRM2KERNEL = ../arm/znrm2.c
ZNRM2KERNEL = ../arm/znrm2.c
SROTKERNEL = ../arm/rot.c
DROTKERNEL = ../arm/rot.c
CROTKERNEL = ../arm/zrot.c
ZROTKERNEL = ../arm/zrot.c
SSCALKERNEL = ../arm/scal.c
DSCALKERNEL = ../arm/scal.c
CSCALKERNEL = ../arm/zscal.c
ZSCALKERNEL = ../arm/zscal.c
SSWAPKERNEL = ../arm/swap.c
DSWAPKERNEL = ../arm/swap.c
CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
DGEMVNKERNEL = ../arm/gemv_n.c
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
DGEMVTKERNEL = ../arm/gemv_t.c
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c
SSYMV_U_KERNEL = ../generic/symv_k.c
SSYMV_L_KERNEL = ../generic/symv_k.c
DSYMV_U_KERNEL = ../generic/symv_k.c
DSYMV_L_KERNEL = ../generic/symv_k.c
QSYMV_U_KERNEL = ../generic/symv_k.c
QSYMV_L_KERNEL = ../generic/symv_k.c
CSYMV_U_KERNEL = ../generic/zsymv_k.c
CSYMV_L_KERNEL = ../generic/zsymv_k.c
ZSYMV_U_KERNEL = ../generic/zsymv_k.c
ZSYMV_L_KERNEL = ../generic/zsymv_k.c
XSYMV_U_KERNEL = ../generic/zsymv_k.c
XSYMV_L_KERNEL = ../generic/zsymv_k.c
ZHEMV_U_KERNEL = ../generic/zhemv_k.c
ZHEMV_L_KERNEL = ../generic/zhemv_k.c
LSAME_KERNEL = ../generic/lsame.c
SCABS_KERNEL = ../generic/cabs.c
DCABS_KERNEL = ../generic/cabs.c
QCABS_KERNEL = ../generic/cabs.c
#Dump kernel
CGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c
ZGEMM3MKERNEL = ../generic/zgemm3mkernel_dump.c

View File

@ -0,0 +1 @@
clean ::

230
kernel/loongarch64/amax.S Normal file
View File

@ -0,0 +1,230 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* amax kernel: returns max(|x[i]|) over n elements of x with stride incx.
   Precision-agnostic: LD/FABS/CMPLT/CMOVT/MTC expand to .s or .d forms
   via common_loongarch64.h depending on DOUBLE.
   Arguments per the LP64 convention: N in $r4, X in $r5, INCX in $r6;
   result returned in $f0. */
#define ASSEMBLER

#include "common.h"

/* integer registers */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18

/* a1..a8: eight elements in flight (8x unrolled, software pipelined);
   t1..t4: |value| temporaries; s1..s4: four running partial maxima. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9

PROLOGUE

#ifdef F_INTERFACE
/* Fortran interface passes N and INCX by reference. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Default result: 0.0 (integer zero bits moved into s1). */
MTC s1, $r0
/* n <= 0: nothing to do. */
bge $r0, N, .L999
/* Scale the stride by the element size (BASE_SHIFT). */
slli.d INCX, INCX, BASE_SHIFT
/* Non-positive stride: return 0. */
bge $r0, INCX, .L999
/* Seed all four partial maxima with |x[0]|. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
bge $r0, N, .L999
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
/* Fewer than 8 remaining elements: go straight to the tail loop. */
bge $r0, I, .L15
/* Preload the first group of 8 elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3

/* Main loop: fold the 8 preloaded elements into s1..s4 while loading
   the next 8 (loads interleaved with compares to hide latency). */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3

/* Pipeline drain: fold the final 8 preloaded elements (no more loads). */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3

/* Tail: handle the remaining N & 7 elements one at a time. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3

/* Reduce the four partial maxima s1..s4 into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3

/* Return: result (s1 = $f22) moved into $f0.
   NOTE(review): "move $r4, $r17" mirrors the MIPS template this was
   ported from; the integer return value is not meaningful to C callers. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0

EPILOGUE

186
kernel/loongarch64/amin.S Normal file
View File

@ -0,0 +1,186 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* amin kernel: returns min(|x[i]|) over n elements of x with stride incx.
   Identical structure to amax.S except every CMPLT has its operands
   swapped (t < s instead of s < t) so the CMOVTs keep the minimum.
   Arguments per the LP64 convention: N in $r4, X in $r5, INCX in $r6;
   result returned in $f0. */
#define ASSEMBLER

#include "common.h"

/* integer registers */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18

/* a1..a8: eight elements in flight (8x unrolled, software pipelined);
   t1..t4: |value| temporaries; s1..s4: four running partial minima. */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9

PROLOGUE

#ifdef F_INTERFACE
/* Fortran interface passes N and INCX by reference. */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif

/* Default result: 0.0 (integer zero bits moved into s1). */
MTC s1, $r0
/* n <= 0: nothing to do. */
bge $r0, N, .L999
/* Scale the stride by the element size (BASE_SHIFT). */
slli.d INCX, INCX, BASE_SHIFT
/* Non-positive stride: return 0. */
bge $r0, INCX, .L999
/* Seed all four partial minima with |x[0]|. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
bge $r0, N, .L999
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
/* Fewer than 8 remaining elements: go straight to the tail loop. */
bge $r0, I, .L15
/* Preload the first group of 8 elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3

/* Main loop: fold the 8 preloaded elements into s1..s4 while loading
   the next 8 (loads interleaved with compares to hide latency). */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3

/* Pipeline drain: fold the final 8 preloaded elements (no more loads). */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3

/* Tail: handle the remaining N & 7 elements one at a time. */
.L15:
andi I, N, 7
NOP
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3

/* Reduce the four partial minima s1..s4 into s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3

/* Return: result (s1 = $f22) moved into $f0.
   NOTE(review): "move $r4, $r17" mirrors the MIPS template this was
   ported from; the integer return value is not meaningful to C callers. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0

EPILOGUE

232
kernel/loongarch64/asum.S Normal file
View File

@ -0,0 +1,232 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* asum kernel for LoongArch64.
   Returns sum(|x[i]|, i = 0..N-1) of vector X with stride INCX in $f0.
   Two accumulators (s1, s2) are interleaved so consecutive FP adds are
   independent; both main loops are software-pipelined, 8 elements per
   iteration.  LD/ST/ADD/FABS/MTC and SIZE/BASE_SHIFT come from
   common_loongarch64.h and select single or double precision. */
#define ASSEMBLER
#include "common.h"
/* integer registers: arguments and scratch */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
/* a1-a8: pipelined input values */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define a5 $f12
#define a6 $f13
#define a7 $f14
#define a8 $f15
/* t1-t4: |a*| temporaries; s1/s2: partial sums ($f22 doubles as return reg) */
#define t1 $f16
#define t2 $f17
#define t3 $f0
#define t4 $f1
#define s1 $f22
#define s2 $f8
PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes arguments by reference */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* zero both accumulators, convert INCX to a byte stride */
MTC s1, $r0
MTC s2, $r0
slli.d INCX, INCX, BASE_SHIFT
li TEMP, SIZE
bge $r0, N, .L999
srai.d I, N, 3
/* non-unit stride goes to the .L20 path */
bne INCX, TEMP, .L20
bge $r0, I, .L15
/* prime the software pipeline: first 8 elements */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
FABS t1, a1
LD a6, X, 5 * SIZE
FABS t2, a2
LD a7, X, 6 * SIZE
FABS t3, a3
FABS t4, a4
addi.d I, I, -1
LD a8, X, 7 * SIZE
bge $r0, I, .L13
.align 3
/* unit-stride main loop: accumulate |a1..a8| while loading the next 8 */
.L12:
ADD s1, s1, t1
LD a1, X, 8 * SIZE
FABS t1, a5
addi.d I, I, -1
ADD s2, s2, t2
LD a2, X, 9 * SIZE
FABS t2, a6
NOP
ADD s1, s1, t3
LD a3, X, 10 * SIZE
FABS t3, a7
NOP
ADD s2, s2, t4
LD a4, X, 11 * SIZE
FABS t4, a8
addi.d X, X, 8 * SIZE
ADD s1, s1, t1
LD a5, X, 4 * SIZE
FABS t1, a1
NOP
ADD s2, s2, t2
LD a6, X, 5 * SIZE
FABS t2, a2
NOP
ADD s1, s1, t3
LD a7, X, 6 * SIZE
FABS t3, a3
NOP
ADD s2, s2, t4
LD a8, X, 7 * SIZE
FABS t4, a4
blt $r0, I, .L12
.align 3
/* drain the pipeline: fold the last 8 absolute values */
.L13:
ADD s1, s1, t1
addi.d X, X, 8 * SIZE
FABS t1, a5
NOP
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* scalar tail: remaining N & 7 elements */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
ADD s1, s1, t1
addi.d X, X, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* strided path (INCX != 1): same pipelining, pointer bumped by INCX bytes */
.L20:
bge $r0, I, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
LD a7, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L24
.align 3
.L23:
ADD s1, s1, t1
LD a1, X, 0 * SIZE
FABS t1, a5
add.d X, X, INCX
ADD s2, s2, t2
LD a2, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
ADD s1, s1, t3
LD a3, X, 0 * SIZE
FABS t3, a7
add.d X, X, INCX
ADD s2, s2, t4
LD a4, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
ADD s1, s1, t1
LD a5, X, 0 * SIZE
FABS t1, a1
add.d X, X, INCX
ADD s2, s2, t2
LD a6, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t3
LD a7, X, 0 * SIZE
FABS t3, a3
add.d X, X, INCX
ADD s2, s2, t4
LD a8, X, 0 * SIZE
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
.align 3
/* drain for the strided loop */
.L24:
ADD s1, s1, t1
FABS t1, a5
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* strided scalar tail */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
add.d X, X, INCX
ADD s1, s1, t1
blt $r0, I, .L26
.align 3
/* combine the two partial sums; result lands in $f0 via s1 ($f22).
   NOTE(review): "move $r4, $r17" copies loop counter I into the integer
   return register — presumably dead code inherited from the MIPS port;
   confirm callers ignore $r4. */
.L999:
ADD s1, s1, s2
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

159
kernel/loongarch64/cnrm2.S Normal file
View File

@ -0,0 +1,159 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* cnrm2 kernel for LoongArch64.
   Returns the Euclidean norm sqrt(sum(re^2 + im^2)) of an N-element
   single-precision complex vector X with stride INCX (in complex
   elements).  Each loaded float is widened to double (fcvt.d.s) and the
   squares are accumulated with double fmadd, so no separate scaling pass
   is needed to avoid intermediate overflow; the result is narrowed back
   to single at the end.  Main loop handles 4 complex elements (8 floats)
   per iteration with software pipelining. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
/* a1-a8: pipelined single-precision loads (re/im pairs) */
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define a5 $f16
#define a6 $f17
#define a7 $f0
#define a8 $f1
/* s1/s2: double-precision partial sums; t1-t4: widened operands */
#define s1 $f22
#define s2 $f8
#define t1 $f23
#define t2 $f9
#define t3 $f10
#define t4 $f11
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* zero both accumulators; convert INCX to a byte stride (complex elems) */
movgr2fr.d s1, $r0
li TEMP, 2 * SIZE
fmov.d s2, s1
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
/* non-positive stride: return 0 (accumulators still zero) */
bge $r0, INCX, .L999
srai.d I, N, 2
bge $r0, I, .L25
/* prime the pipeline with 4 complex elements */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
fcvt.d.s t1, a1
LD a7, X, 0 * SIZE
fcvt.d.s t2, a2
LD a8, X, 1 * SIZE
fcvt.d.s t3, a3
addi.d I, I, -1
fcvt.d.s t4, a4
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/* main loop: s += t*t in double while loading/widening the next batch */
.L23:
fmadd.d s1, t1, t1, s1
LD a1, X, 0 * SIZE
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
LD a2, X, 1 * SIZE
fcvt.d.s t2, a6
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a3, X, 0 * SIZE
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
LD a4, X, 1 * SIZE
fcvt.d.s t4, a8
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
LD a5, X, 0 * SIZE
fcvt.d.s t1, a1
addi.d I, I, -1
fmadd.d s2, t2, t2, s2
LD a6, X, 1 * SIZE
fcvt.d.s t2, a2
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a7, X, 0 * SIZE
fcvt.d.s t3, a3
LD a8, X, 1 * SIZE
fmadd.d s2, t4, t4, s2
add.d X, X, INCX
fcvt.d.s t4, a4
blt $r0, I, .L23
.align 3
/* drain the pipeline */
.L24:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
.align 3
/* tail: remaining N & 3 complex elements */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fcvt.d.s t2, a2
fmadd.d s1, t1, t1, s1
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
blt $r0, I, .L26
.align 3
/* combine, sqrt in double, narrow to single for the return value */
.L999:
fadd.d s1, s1, s2
fsqrt.d s1, s1
move $r4, $r17
fcvt.s.d $f0, s1
jirl $r0, $r1, 0x0
EPILOGUE

225
kernel/loongarch64/copy.S Normal file
View File

@ -0,0 +1,225 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* copy kernel for LoongArch64: Y[i*INCY] = X[i*INCX] for i = 0..N-1.
   When both strides are 1 the unit path (.L12) copies 8 elements per
   iteration with loads for the next batch overlapping stores of the
   current one; otherwise the strided path (.L22) is used. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* a1-a8: in-flight elements for the 8-wide pipeline */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* convert both strides to bytes; fall to .L20 unless both are unit */
li TEMP, SIZE
NOP
slli.d INCX, INCX, BASE_SHIFT
bge $r0, N, .L999
slli.d INCY, INCY, BASE_SHIFT
bne INCX, TEMP, .L20
srai.d I, N, 3
bne INCY, TEMP, .L20
addi.d I, I, -1
blt I, $r0, .L15
/* prime: load the first 8 elements */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
bge $r0, I, .L13
.align 3
/* unit-stride main loop: store batch k while loading batch k+1 */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* drain: store the final in-flight batch */
.L13:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
ST a3, Y, 2 * SIZE
ST a4, Y, 3 * SIZE
ST a5, Y, 4 * SIZE
ST a6, Y, 5 * SIZE
ST a7, Y, 6 * SIZE
ST a8, Y, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* tail: remaining N & 7 elements, one at a time */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d X, X, SIZE
addi.d I, I, -1
addi.d Y, Y, SIZE
ST a1, Y, -1 * SIZE
blt $r0, I, .L16
b .L999
.align 3
/* strided path: same 8-wide scheme, pointers bumped by INCX/INCY bytes */
.L20:
srai.d I, N, 3
addi.d I, I, -1
blt I, $r0, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
add.d X, X, INCX
bge $r0, I, .L23
.align 3
.L22:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* drain the strided pipeline */
.L23:
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
add.d Y, Y, INCY
ST a6, Y, 0 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
add.d Y, Y, INCY
ST a8, Y, 0 * SIZE
add.d Y, Y, INCY
.align 3
/* strided tail */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
addi.d I, I, -1
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
/* NOTE(review): copy has no meaningful return value; the $r4/$f0 moves
   look inherited from the MIPS template — confirm callers ignore them. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

314
kernel/loongarch64/dnrm2.S Normal file
View File

@ -0,0 +1,314 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* dnrm2 kernel for LoongArch64.
   Computes the Euclidean norm of an N-element double vector X with
   stride INCX using the classic two-pass scaled algorithm:
     pass 1 (.L12/.L16): max = max_i |x[i]|  (4 partial maxima s1..s4)
     pass 2 (.L103/.L106): sum = sum_i (x[i]/max)^2, via ALPHA = 1/max
     result = max * sqrt(sum)
   Scaling by 1/max avoids overflow/underflow when squaring.  XX keeps a
   copy of the original X pointer for the second pass. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define XX $r7
#define I $r17
#define TEMP $r18
/* a1-a8: pipelined loads; t1-t4: |a*| / scaled temporaries */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1-s4: pass-1 partial maxima, reused as pass-2 partial sums */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define ALPHA $f4
#define max $f5
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
/* non-positive stride: return 0 */
bge $r0, INCX, .L999
/* save start pointer for pass 2 */
move XX, X
NOP
/* seed all four maxima with |x[0]| and consume that element (N -= 1) */
LD a1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
FABS s1, a1
FABS s2, a1
bge $r0, N, .L999
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
bge $r0, I, .L15
/* prime pass-1 pipeline with 8 elements */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* pass-1 main loop: s* = max(s*, |a*|) via CMPLT + CMOVT, 8 per iter */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
blt $r0, I, .L12
.align 3
/* drain pass-1 pipeline */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
FABS t1, a5
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t2, $fcc1
CMOVT s3, s3, t3, $fcc2
CMOVT s4, s4, t4, $fcc3
.align 3
/* pass-1 tail ((N-1) & 7 elements) */
.L15:
andi I, N, 7
bge $r0, I, .L100
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
FABS t1, a1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* reduce the 4 maxima; set up pass 2 */
.L100:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
/* restore full element count for pass 2 */
addi.d N, N, 1
/* build 1.0f (bit pattern 0x3f800000) in ALPHA, widen to double */
lu12i.w TEMP, 0x3f800
movgr2fr.d a1, $r0
movgr2fr.w ALPHA, TEMP
/* max == 0 means the whole vector is zero: return s1 (= 0) */
CMPEQ $fcc0, s1, a1
fcvt.d.s ALPHA, ALPHA
bcnez $fcc0, .L999
/* ALPHA = 1/max; clear the four sum accumulators */
fdiv.d ALPHA, ALPHA, s1
MOV max, s1
MOV s1, a1
MOV s2, a1
MOV s3, a1
MOV s4, a1
srai.d I, N, 3
bge $r0, I, .L105
/* prime pass-2 pipeline from the saved pointer XX */
LD a1, XX, 0 * SIZE
add.d XX, XX, INCX
LD a2, XX, 0 * SIZE
add.d XX, XX, INCX
LD a3, XX, 0 * SIZE
add.d XX, XX, INCX
LD a4, XX, 0 * SIZE
add.d XX, XX, INCX
LD a5, XX, 0 * SIZE
add.d XX, XX, INCX
LD a6, XX, 0 * SIZE
add.d XX, XX, INCX
LD a7, XX, 0 * SIZE
add.d XX, XX, INCX
LD a8, XX, 0 * SIZE
addi.d I, I, -1
add.d XX, XX, INCX
bge $r0, I, .L104
.align 3
/* pass-2 main loop: s* += (ALPHA*a)^2, 8 per iter */
.L103:
MUL t1, ALPHA, a1
LD a1, XX, 0 * SIZE
MUL t2, ALPHA, a2
add.d XX, XX, INCX
MUL t3, ALPHA, a3
LD a2, XX, 0 * SIZE
MUL t4, ALPHA, a4
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a3, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a4, XX, 0 * SIZE
MADD s4, t4, t4, s4
add.d XX, XX, INCX
MUL t1, ALPHA, a5
LD a5, XX, 0 * SIZE
MUL t2, ALPHA, a6
add.d XX, XX, INCX
MUL t3, ALPHA, a7
LD a6, XX, 0 * SIZE
MUL t4, ALPHA, a8
add.d XX, XX, INCX
MADD s1, t1, t1, s1
LD a7, XX, 0 * SIZE
MADD s2, t2, t2, s2
add.d XX, XX, INCX
MADD s3, t3, t3, s3
LD a8, XX, 0 * SIZE
MADD s4, t4, t4, s4
addi.d I, I, -1
add.d XX, XX, INCX
blt $r0, I, .L103
.align 3
/* drain pass-2 pipeline */
.L104:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
MUL t1, ALPHA, a5
MUL t2, ALPHA, a6
MUL t3, ALPHA, a7
MUL t4, ALPHA, a8
MADD s1, t1, t1, s1
MADD s2, t2, t2, s2
MADD s3, t3, t3, s3
MADD s4, t4, t4, s4
.align 3
/* pass-2 tail */
.L105:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L106:
LD a1, XX, 0 * SIZE
addi.d I, I, -1
MUL t1, ALPHA, a1
add.d XX, XX, INCX
MADD s1, t1, t1, s1
blt $r0, I, .L106
.align 3
/* combine sums, result = max * sqrt(sum) */
.L998:
ADD s1, s1, s2
ADD s3, s3, s4
ADD s1, s1, s3
fsqrt.d s1, s1
move $r4, $r17
MUL $f0, max, s1
jirl $r0, $r1, 0x0
.align 3
/* early-out return path: $f0 = s1 ($f22), 0 or the seeded value */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

391
kernel/loongarch64/dot.S Normal file
View File

@ -0,0 +1,391 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/* dot kernel for LoongArch64.
   Returns sum_i X[i*INCX] * Y[i*INCY] in $f0.  When DSDOT is defined
   (sdsdot/dsdot), single-precision inputs are widened to double before
   the multiply-accumulate; otherwise MADD works in the native precision.
   Two accumulators s1/s2 hide FMA latency; the unit-stride path handles
   8 products per iteration.
   FIX: the F_INTERFACE negative-stride adjustment used the MIPS
   instructions mult/mflo/dsub, which do not exist on LoongArch64 and
   fail to assemble; replaced with the LoongArch mul.d/sub.d sequence
   computing X -= (N-1)*INCX (byte stride) so X points at the first
   element touched, matching BLAS negative-increment semantics. */
#define ASSEMBLER
#include "common.h"
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* a1-a4 / b1-b4: pipelined operands; s1/s2: partial dot products */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define b1 $f12
#define b2 $f13
#define b3 $f14
#define b4 $f15
#define s1 $f22
#define s2 $f8
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* zero accumulators; convert both strides to bytes */
MTC s1, $r0
MTC s2, $r0
slli.d INCX, INCX, BASE_SHIFT
li TEMP, SIZE
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
srai.d I, N, 3
/* any non-unit stride takes the generic path */
bne INCX, TEMP, .L20
bne INCY, TEMP, .L20
bge $r0, I, .L15
/* prime the pipeline: first 4 operand pairs */
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
LD a2, X, 1 * SIZE
LD b2, Y, 1 * SIZE
LD a3, X, 2 * SIZE
LD b3, Y, 2 * SIZE
LD a4, X, 3 * SIZE
addi.d I, I, -1
LD b4, Y, 3 * SIZE
bge $r0, I, .L13
.align 3
/* unit-stride main loop: 8 multiply-adds, next operands loaded early */
.L12:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 8 * SIZE
LD b1, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 9 * SIZE
LD b2, Y, 9 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 10 * SIZE
LD b3, Y, 10 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 11 * SIZE
LD b4, Y, 11 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* drain: finish the 8 in-flight products of the last batch */
.L13:
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 4 * SIZE
LD b1, Y, 4 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
LD a2, X, 5 * SIZE
LD b2, Y, 5 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
LD a3, X, 6 * SIZE
LD b3, Y, 6 * SIZE
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
LD a4, X, 7 * SIZE
LD b4, Y, 7 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d X, X, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a2, a2
fcvt.d.s b2, b2
fmadd.d s2, b2, a2, s2
#else
MADD s2, b2, a2, s2
#endif
addi.d Y, Y, 8 * SIZE
#ifdef DSDOT
fcvt.d.s a3, a3
fcvt.d.s b3, b3
fmadd.d s1, b3, a3, s1
#else
MADD s1, b3, a3, s1
#endif
#ifdef DSDOT
fcvt.d.s a4, a4
fcvt.d.s b4, b4
fmadd.d s2, b4, a4, s2
#else
MADD s2, b4, a4, s2
#endif
.align 3
/* unit-stride tail: remaining N & 7 element pairs */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
addi.d I, I, -1
addi.d X, X, SIZE
addi.d Y, Y, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* generic-stride path */
.L20:
#ifdef F_INTERFACE
/* BLAS negative increments address the vector from its end:
   rewind the base pointer by (N-1)*inc bytes so the loop below can
   always walk forward.  (Was MIPS mult/mflo/dsub; LoongArch64 uses
   mul.d/sub.d.) */
bgez INCX, .L21
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCX
sub.d X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mul.d TEMP, TEMP, INCY
sub.d Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25
.align 3
/* strided main loop: 8 products per iteration, alternating s1/s2 */
.L23:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s2, b1, a1, s2
#else
MADD s2, b1, a1, s2
#endif
blt $r0, I, .L23
.align 3
/* strided tail */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
#ifdef DSDOT
fcvt.d.s a1, a1
fcvt.d.s b1, b1
fmadd.d s1, b1, a1, s1
#else
MADD s1, b1, a1, s1
#endif
blt $r0, I, .L26
.align 3
/* combine the two partial sums into the return register */
.L999:
#ifdef DSDOT
fadd.d $f0, s1, s2
#else
ADD $f0, s1, s2
#endif
move $r4, $r17
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

531
kernel/loongarch64/gemv_n.S Normal file
View File

@ -0,0 +1,531 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Unused param dummy1 */
/* GEMV "N" kernel for LoongArch64: y += A * (alpha * x), processed two
   columns of A at a time (x entries are pre-scaled by ALPHA at .L11/.L21).
   If the caller's y stride is not one element, y is first gathered into
   the contiguous BUFFER (.L02/.L06), updated in place there, and scattered
   back to the strided y at the very end (.L902/.L906). */
/* Integer register map (INCY and BUFFER arrive on the stack). */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define YORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
/* FP register map: a1-a8 hold matrix elements, x1/x2 the two scaled x
   entries, y1-y8 the y operands being updated, t1-t4 the MADD results. */
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define x1 $f14
#define x2 $f15
#define y1 $f16
#define y2 $f17
#define y3 $f3
#define y4 $f1
#define y5 $f2
#define y6 $f4
#define y7 $f5
#define y8 $f6
#define t1 $f7
#define t2 $f18
#define t3 $f19
#define t4 $f20
PROLOGUE
/* 9th and 10th arguments are passed on the stack. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
/* Frame: 16 bytes for $r23/$r24; the 32-bit path also spills $f18-$f20. */
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -48
#endif
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
/* Convert element strides to byte strides. */
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
fst.d $f18, $sp, 16
fst.d $f19, $sp, 24
fst.d $f20, $sp, 32
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do for empty dimensions. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
/* If y is already contiguous (INCY == one element), operate on it in
   place; otherwise gather it into BUFFER first. */
li I, SIZE
move YORIG, Y
beq INCY, I, .L10
srai.d I, M, 2
move YORIG, BUFFER
move XX, Y
move YY, BUFFER
bge $r0, I, .L05
.align 3
/* Gather loop: copy floor(M/4)*4 strided y elements into BUFFER. */
.L02:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
LD a2, XX, 0 * SIZE
add.d XX, XX, INCY
LD a3, XX, 0 * SIZE
add.d XX, XX, INCY
LD a4, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
.L05:
/* Gather the remaining M % 4 elements one at a time. */
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, XX, 0 * SIZE
add.d XX, XX, INCY
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
.L10:
/* Outer loop over column pairs: J = N / 2. */
srai.d J, N, 1
bge $r0, J, .L20
.align 3
.L11:
/* Fetch two x entries and pre-scale them by alpha. */
LD x1, X, 0 * SIZE
add.d X, X, INCX
LD x2, X, 0 * SIZE
add.d X, X, INCX
move AO1, A
add.d AO2, A, LDA
add.d A, AO2, LDA
move YY, YORIG
MUL x1, ALPHA, x1
srai.d I, M, 3
MUL x2, ALPHA, x2
bge $r0, I, .L15
/* Software-pipeline prologue: preload first 4 elements of each column
   and 8 elements of y. */
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
LD y5, YY, 4 * SIZE
LD a6, AO2, 1 * SIZE
LD y6, YY, 5 * SIZE
LD a7, AO2, 2 * SIZE
LD y7, YY, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Main pipelined loop: 8 rows per iteration, loads for the next
   iteration interleaved with the MADDs of the current one. */
.L12:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 8 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 9 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 10 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 11 * SIZE
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Pipeline drain: consume the final 8 preloaded rows without issuing
   further y prefetches. */
.L13:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
MADD t1, a5, x2, t1
LD a5, AO2, 4 * SIZE
MADD t2, a6, x2, t2
LD a6, AO2, 5 * SIZE
MADD t3, a7, x2, t3
LD a7, AO2, 6 * SIZE
MADD t4, a8, x2, t4
LD a8, AO2, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
MADD t1, a5, x2, t1
addi.d AO1, AO1, 8 * SIZE
MADD t2, a6, x2, t2
addi.d AO2, AO2, 8 * SIZE
MADD t3, a7, x2, t3
addi.d YY, YY, 8 * SIZE
MADD t4, a8, x2, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
.L15:
/* Remainder: 4 rows if M & 4. */
andi I, M, 4
bge $r0, I, .L16
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 1 * SIZE
MADD y2, a2, x1, y2
LD a7, AO2, 2 * SIZE
MADD y3, a3, x1, y3
LD a8, AO2, 3 * SIZE
MADD y4, a4, x1, y4
MADD y1, a5, x2, y1
addi.d YY, YY, 4 * SIZE
MADD y2, a6, x2, y2
addi.d AO1, AO1, 4 * SIZE
MADD y3, a7, x2, y3
addi.d AO2, AO2, 4 * SIZE
MADD y4, a8, x2, y4
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
.L16:
/* Remainder: 2 rows if M & 2. */
andi I, M, 2
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a5, AO2, 0 * SIZE
LD a6, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
addi.d YY, YY, 2 * SIZE
MADD y1, a5, x2, y1
addi.d AO1, AO1, 2 * SIZE
MADD y2, a6, x2, y2
addi.d AO2, AO2, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
.L17:
/* Remainder: final row if M is odd. */
andi I, M, 1
bge $r0, I, .L19
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a5, AO2, 0 * SIZE
MADD y1, a1, x1, y1
MADD y1, a5, x2, y1
ST y1, YY, 0 * SIZE
.align 3
.L19:
addi.d J, J, -1
blt $r0, J, .L11
.align 3
.L20:
/* If N is odd, process the last single column. */
andi J, N, 1
bge $r0, J, .L900
.align 3
.L21:
LD x1, X, 0 * SIZE
add.d X, X, INCX
move YY, YORIG
move AO1, A
srai.d I, M, 3
MUL x1, ALPHA, x1
bge $r0, I, .L25
/* Pipeline prologue for the single-column loop. */
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
LD y5, YY, 4 * SIZE
LD y6, YY, 5 * SIZE
LD y7, YY, 6 * SIZE
addi.d I, I, -1
LD y8, YY, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Single-column pipelined loop: 8 rows per iteration. */
.L22:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
LD y1, YY, 8 * SIZE
LD y2, YY, 9 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
LD y3, YY, 10 * SIZE
LD y4, YY, 11 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD t1, a1, x1, y5
LD a1, AO1, 8 * SIZE
MADD t2, a2, x1, y6
LD a2, AO1, 9 * SIZE
LD y5, YY, 12 * SIZE
LD y6, YY, 13 * SIZE
MADD t3, a3, x1, y7
LD a3, AO1, 10 * SIZE
MADD t4, a4, x1, y8
LD a4, AO1, 11 * SIZE
LD y7, YY, 14 * SIZE
LD y8, YY, 15 * SIZE
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Pipeline drain for the single-column loop. */
.L23:
MADD t1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD t2, a2, x1, y2
LD a2, AO1, 5 * SIZE
MADD t3, a3, x1, y3
LD a3, AO1, 6 * SIZE
MADD t4, a4, x1, y4
LD a4, AO1, 7 * SIZE
ST t1, YY, 0 * SIZE
MADD t1, a1, x1, y5
ST t2, YY, 1 * SIZE
MADD t2, a2, x1, y6
ST t3, YY, 2 * SIZE
MADD t3, a3, x1, y7
ST t4, YY, 3 * SIZE
MADD t4, a4, x1, y8
ST t1, YY, 4 * SIZE
ST t2, YY, 5 * SIZE
ST t3, YY, 6 * SIZE
ST t4, YY, 7 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d YY, YY, 8 * SIZE
.align 3
.L25:
/* Single-column remainders: M & 4, M & 2, M & 1. */
andi I, M, 4
bge $r0, I, .L26
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
MADD y3, a3, x1, y3
addi.d YY, YY, 4 * SIZE
MADD y4, a4, x1, y4
addi.d AO1, AO1, 4 * SIZE
ST y1, YY, -4 * SIZE
ST y2, YY, -3 * SIZE
ST y3, YY, -2 * SIZE
ST y4, YY, -1 * SIZE
.align 3
.L26:
andi I, M, 2
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
MADD y1, a1, x1, y1
addi.d YY, YY, 2 * SIZE
MADD y2, a2, x1, y2
addi.d AO1, AO1, 2 * SIZE
ST y1, YY, -2 * SIZE
ST y2, YY, -1 * SIZE
.align 3
.L27:
andi I, M, 1
bge $r0, I, .L900
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
MADD y1, a1, x1, y1
ST y1, YY, 0 * SIZE
.align 3
.L900:
/* Scatter BUFFER back into the strided y, unless y was used in place
   (INCY == one element).  YORIG is reused here as a scratch register. */
li YORIG, SIZE
srai.d I, M, 2
beq INCY, YORIG, .L999
move XX, BUFFER
bge $r0, I, .L905
.align 3
.L902:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
LD a3, XX, 2 * SIZE
LD a4, XX, 3 * SIZE
ST a1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, Y, 0 * SIZE
add.d Y, Y, INCY
addi.d I, I, -1
addi.d XX, XX, 4 * SIZE
blt $r0, I, .L902
.align 3
.L905:
andi I, M, 3
bge $r0, I, .L999
.align 3
.L906:
LD a1, XX, 0 * SIZE
addi.d XX, XX, 1 * SIZE
ST a1, Y, 0 * SIZE
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L906
.align 3
.L999:
/* Restore callee-saved registers and return. */
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
fld.d $f19, $sp, 24
fld.d $f20, $sp, 32
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 48
#endif
/* NOTE(review): $r17 is never written in this routine, so the integer
   value placed in $r4 here is indeterminate; presumably the caller
   ignores the return value of GEMV — confirm against the C prototype. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

436
kernel/loongarch64/gemv_t.S Normal file
View File

@ -0,0 +1,436 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* Unused param dummy1 */
/* GEMV "T" kernel for LoongArch64: y[j] += alpha * dot(A[:,j], x),
   two columns of A per outer iteration.  If the caller's x stride is
   not one element, x is first gathered into the contiguous BUFFER
   (.L02/.L06) and XORIG points at the packed copy. */
/* Integer register map (INCY and BUFFER arrive on the stack). */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r16
#define XORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
/* FP register map: a1-a8 hold matrix elements, x1-x8 packed x entries,
   y1-y4 the four running dot-product accumulators. */
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define y1 $f14
#define y2 $f15
#define y3 $f16
#define y4 $f17
#define x1 $f3
#define x2 $f1
#define x3 $f2
#define x4 $f4
#define x5 $f5
#define x6 $f6
#define x7 $f7
#define x8 $f18
PROLOGUE
/* 9th and 10th arguments are passed on the stack. */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
/* Frame: 16 bytes for $r23/$r24; the 32-bit path also spills $f18. */
#ifdef __64BIT__
addi.d $sp, $sp, -16
#else
addi.d $sp, $sp, -32
#endif
/* y1 = 0.0: zero seed for the dot-product accumulators. */
MTC y1, $r0
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
/* Convert element strides to byte strides. */
slli.d LDA, LDA, BASE_SHIFT
#ifndef __64BIT__
fst.d $f18, $sp, 16
#endif
slli.d INCX, INCX, BASE_SHIFT
/* Nothing to do for empty dimensions. */
bge $r0, M, .L999
slli.d INCY, INCY, BASE_SHIFT
bge $r0, N, .L999
/* If x is already contiguous (INCX == one element), use it in place;
   otherwise gather it into BUFFER first. */
li I, SIZE
move XORIG, X
beq INCX, I, .L10
srai.d I, M, 2
move XORIG, BUFFER
move YY, BUFFER
bge $r0, I, .L05
.align 3
/* Gather loop: copy floor(M/4)*4 strided x elements into BUFFER. */
.L02:
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
ST a3, YY, 2 * SIZE
ST a4, YY, 3 * SIZE
addi.d I, I, -1
addi.d YY, YY, 4 * SIZE
blt $r0, I, .L02
.align 3
.L05:
/* Gather the remaining M % 4 elements one at a time. */
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, YY, 0 * SIZE
addi.d I, I, -1
addi.d YY, YY, 1 * SIZE
blt $r0, I, .L06
.align 3
.L10:
/* Outer loop over column pairs: J = N / 2.  YY walks the output y. */
srai.d J, N, 1
move YY, Y
bge $r0, J, .L20
.align 3
.L11:
/* Zero the four partial accumulators (y1 was cleared by MTC above and
   is reset at .L19 on every iteration). */
move AO1, A
MOV y2, y1
add.d AO2, A, LDA
MOV y3, y1
add.d A, AO2, LDA
MOV y4, y1
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L15
/* Software-pipeline prologue: preload 4 rows of both columns and 8 x
   entries. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD x2, XX, 1 * SIZE
LD a3, AO1, 1 * SIZE
LD x3, XX, 2 * SIZE
LD a4, AO2, 1 * SIZE
LD x4, XX, 3 * SIZE
LD a5, AO1, 2 * SIZE
LD x5, XX, 4 * SIZE
LD a6, AO2, 2 * SIZE
LD x6, XX, 5 * SIZE
LD a7, AO1, 3 * SIZE
LD x7, XX, 6 * SIZE
LD a8, AO2, 3 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Main pipelined loop: 8 rows of both columns per iteration; y1/y3
   accumulate column 1, y2/y4 accumulate column 2. */
.L12:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y2, a2, x5, y2
LD a2, AO2, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
MADD y4, a4, x6, y4
LD a4, AO2, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y2, a6, x7, y2
LD a6, AO2, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
MADD y4, a8, x8, y4
LD a8, AO2, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Pipeline drain: consume the final 8 preloaded rows. */
.L13:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y2, a2, x1, y2
LD a2, AO2, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y4, a4, x2, y4
LD a4, AO2, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y2, a6, x3, y2
LD a6, AO2, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y4, a8, x4, y4
LD a8, AO2, 7 * SIZE
MADD y1, a1, x5, y1
MADD y2, a2, x5, y2
MADD y3, a3, x6, y3
MADD y4, a4, x6, y4
MADD y1, a5, x7, y1
addi.d XX, XX, 8 * SIZE
MADD y2, a6, x7, y2
addi.d AO1, AO1, 8 * SIZE
MADD y3, a7, x8, y3
addi.d AO2, AO2, 8 * SIZE
MADD y4, a8, x8, y4
.align 3
.L15:
/* Remainder: 4 rows if M & 4. */
andi I, M, 4
bge $r0, I, .L17
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a2, AO2, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a4, AO2, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a6, AO2, 2 * SIZE
MADD y2, a2, x1, y2
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y4, a4, x2, y4
LD a8, AO2, 3 * SIZE
MADD y1, a5, x3, y1
MADD y2, a6, x3, y2
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
MADD y4, a8, x4, y4
addi.d AO2, AO2, 4 * SIZE
.align 3
.L17:
/* Fold the split accumulators, then handle the last M % 4 rows
   one at a time. */
andi I, M, 3
ADD y1, y1, y3
ADD y2, y2, y4
bge $r0, I, .L19
.align 3
.L18:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO2, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
addi.d AO2, AO2, 1 * SIZE
MADD y1, a1, x1, y1
MADD y2, a2, x1, y2
blt $r0, I, .L18
.align 3
.L19:
/* y[j]   += alpha * dot1;  y[j+1] += alpha * dot2.
   Y reads and YY writes the same strided vector in lockstep. */
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
addi.d J, J, -1
MADD a2, y2, ALPHA, a2
/* Re-zero the accumulator seed for the next column pair. */
MTC y1, $r0
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
blt $r0, J, .L11
.align 3
.L20:
/* If N is odd, process the last single column (accumulators y1/y3). */
andi J, N, 1
MOV y3, y1
move AO1, A
bge $r0, J, .L999
srai.d I, M, 3
move XX, XORIG
bge $r0, I, .L25
/* Pipeline prologue for the single-column loop. */
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
LD a7, AO1, 3 * SIZE
LD x4, XX, 3 * SIZE
LD x5, XX, 4 * SIZE
LD x6, XX, 5 * SIZE
LD x7, XX, 6 * SIZE
addi.d I, I, -1
LD x8, XX, 7 * SIZE
bge $r0, I, .L23
.align 3
/* Single-column pipelined loop: 8 rows per iteration. */
.L22:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
LD x1, XX, 8 * SIZE
LD x2, XX, 9 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
LD x3, XX, 10 * SIZE
LD x4, XX, 11 * SIZE
MADD y1, a1, x5, y1
LD a1, AO1, 8 * SIZE
MADD y3, a3, x6, y3
LD a3, AO1, 9 * SIZE
LD x5, XX, 12 * SIZE
LD x6, XX, 13 * SIZE
MADD y1, a5, x7, y1
LD a5, AO1, 10 * SIZE
MADD y3, a7, x8, y3
LD a7, AO1, 11 * SIZE
LD x7, XX, 14 * SIZE
LD x8, XX, 15 * SIZE
addi.d I, I, -1
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* Pipeline drain for the single-column loop. */
.L23:
MADD y1, a1, x1, y1
LD a1, AO1, 4 * SIZE
MADD y3, a3, x2, y3
LD a3, AO1, 5 * SIZE
MADD y1, a5, x3, y1
LD a5, AO1, 6 * SIZE
MADD y3, a7, x4, y3
LD a7, AO1, 7 * SIZE
MADD y1, a1, x5, y1
MADD y3, a3, x6, y3
MADD y1, a5, x7, y1
MADD y3, a7, x8, y3
addi.d XX, XX, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
.align 3
.L25:
/* Remainder: 4 rows if M & 4. */
andi I, M, 4
bge $r0, I, .L27
LD a1, AO1, 0 * SIZE
LD x1, XX, 0 * SIZE
LD a3, AO1, 1 * SIZE
LD x2, XX, 1 * SIZE
LD a5, AO1, 2 * SIZE
LD x3, XX, 2 * SIZE
MADD y1, a1, x1, y1
LD a7, AO1, 3 * SIZE
MADD y3, a3, x2, y3
LD x4, XX, 3 * SIZE
MADD y1, a5, x3, y1
addi.d XX, XX, 4 * SIZE
MADD y3, a7, x4, y3
addi.d AO1, AO1, 4 * SIZE
.align 3
.L27:
/* Fold the split accumulators, then the last M % 4 rows. */
andi I, M, 3
ADD y1, y1, y3
bge $r0, I, .L29
.align 3
.L28:
LD x1, XX, 0 * SIZE
LD a1, AO1, 0 * SIZE
addi.d I, I, -1
addi.d XX, XX, 1 * SIZE
addi.d AO1, AO1, 1 * SIZE
MADD y1, a1, x1, y1
blt $r0, I, .L28
.align 3
.L29:
/* y[last] += alpha * dot. */
LD a1, Y, 0 * SIZE
add.d Y, Y, INCY
MADD a1, y1, ALPHA, a1
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
.align 3
.L999:
/* Restore callee-saved registers and return. */
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
#ifndef __64BIT__
fld.d $f18, $sp, 16
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 16
#else
addi.d $sp, $sp, 32
#endif
/* NOTE(review): $r17 is never written in this routine, so the integer
   value placed in $r4 here is indeterminate; presumably the caller
   ignores the return value of GEMV — confirm against the C prototype. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamax.S Normal file
View File

@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* IAMAX kernel for LoongArch64: returns (in $r4) the 1-based index of
   the first element of x with the largest absolute value; returns 0 for
   N <= 0 or INCX <= 0.
   Strategy: four interleaved running maxima s1..s4 with candidate
   positions x1..x4; TEMP is a running position counter shared by all
   four lanes.  Because strict CMPLT is used, earlier occurrences win
   ties, matching BLAS i?amax semantics. */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result 0 for degenerate inputs. */
li x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Seed all four lanes with |x[1]|; element 1 is consumed here, so the
   running position counter TEMP starts at 2. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
li x1, 1
bge $r0, N, .L999
FABS s1, a1
add.d X, X, INCX
FABS s2, a1
li x2, 1
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
li x3, 1
li TEMP, 2
li x4, 1
bge $r0, I, .L15
/* Software-pipeline prologue: preload 8 elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: 8 elements per iteration, 4 per half.  All four lanes
   record the same TEMP on update; lane k's true position is TEMP+k,
   corrected once by the constant offsets at the end of .L13. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d I, I, -1
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t2
add.d X, X, INCX
CMPLT $fcc2, s3, t3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, t4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Pipeline drain: process the last 8 preloaded elements, then convert
   lane-relative candidate positions to absolute ones (+0,+1,+2,+3).
   A lane whose candidate was never updated still holds its seed value,
   which can never win the strict comparisons in .L998. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
FABS t1, a5
addi.d TEMP, TEMP, 4
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t2
CMPLT $fcc2, s3, t3
CMPLT $fcc3, s4, t4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
.L15:
/* Remainder: the last N % 8 elements, lane 1 only (offset 0). */
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
addi.d I, I, -1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
.L998:
/* Tournament reduction of the four lanes; strict less-than keeps the
   lowest-index winner on ties. */
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
.L999:
/* Return the index in $r4 (x1 == $r17); $f0 gets the max |value|. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

233
kernel/loongarch64/iamin.S Normal file
View File

@ -0,0 +1,233 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* IAMIN kernel for LoongArch64: returns (in $r4) the 1-based index of
   the first element of x with the smallest absolute value; returns 0
   for N <= 0 or INCX <= 0.  Mirror of iamax.S: identical four-lane
   structure, with every CMPLT comparison reversed (t < s instead of
   s < t) so CMOVT/MOVT track the running minimum. */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result 0 for degenerate inputs. */
li x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Seed all four lanes with |x[1]|; TEMP starts at 2 because element 1
   is consumed here. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
li x1, 1
bge $r0, N, .L999
FABS s1, a1
add.d X, X, INCX
FABS s2, a1
li x2, 1
FABS s3, a1
srai.d I, N, 3
FABS s4, a1
li x3, 1
li TEMP, 2
li x4, 1
bge $r0, I, .L15
/* Software-pipeline prologue: preload 8 elements. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: 8 elements per iteration.  All four lanes record the same
   TEMP on update; lane k's true position is TEMP+k, corrected once at
   the end of .L13. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
add.d X, X, INCX
FABS t3, a3
LD a2, X, 0 * SIZE
FABS t4, a4
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d I, I, -1
FABS t1, a5
LD a5, X, 0 * SIZE
FABS t2, a6
add.d X, X, INCX
FABS t3, a7
LD a6, X, 0 * SIZE
FABS t4, a8
add.d X, X, INCX
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t2, s2
add.d X, X, INCX
CMPLT $fcc2, t3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, t4, s4
add.d X, X, INCX
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Pipeline drain: process the last 8 preloaded elements, then convert
   lane-relative candidate positions to absolute ones (+0,+1,+2,+3). */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
FABS t1, a5
addi.d TEMP, TEMP, 4
FABS t2, a6
FABS t3, a7
FABS t4, a8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t2, s2
CMPLT $fcc2, t3, s3
CMPLT $fcc3, t4, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t2, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t3, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t4, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
.L15:
/* Remainder: the last N % 8 elements, lane 1 only (offset 0). */
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
add.d X, X, INCX
FABS t1, a1
addi.d I, I, -1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
.L998:
/* Tournament reduction of the four lanes; strict less-than keeps the
   lowest-index winner on ties. */
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
.L999:
/* Return the index in $r4 (x1 == $r17); $f0 gets the min |value|. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/izamax.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* IZAMAX kernel for LoongArch64: returns (in $r4) the 1-based index of
   the first complex element of x maximizing |Re| + |Im| (the standard
   BLAS iz/ic-amax magnitude); returns 0 for N <= 0 or INCX <= 0.
   INCX is scaled by ZBASE_SHIFT (two scalars per complex element).
   Four interleaved running maxima s1..s4 with candidate positions
   x1..x4 and a shared position counter TEMP, as in iamax.S, but
   unrolled by 4 complex elements instead of 8 reals. */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Result 0 for degenerate inputs. */
li x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* Seed all four lanes with |Re(x[1])| + |Im(x[1])|; TEMP starts at 2
   because element 1 is consumed here. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
ADD s4, t1, t2
addi.d N, N, -1
li x1, 1
bge $r0, N, .L999
add.d X, X, INCX
li x2, 1
srai.d I, N, 2
li x3, 1
li TEMP, 2
li x4, 1
bge $r0, I, .L15
/* Software-pipeline prologue: preload 4 complex elements (8 scalars). */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main loop: 4 complex elements per iteration; t1/t3/t5/t7 become the
   four magnitudes.  All lanes record the same TEMP on update; lane k's
   true position is TEMP+k, corrected once at the end of .L13. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t3
LD a8, X, 1 * SIZE
CMPLT $fcc2, s3, t5
add.d X, X, INCX
CMPLT $fcc3, s4, t7
addi.d I, I, -1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Pipeline drain: process the last 4 preloaded complex elements, then
   convert lane-relative candidate positions to absolute (+0,+1,+2,+3). */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t3
CMPLT $fcc2, s3, t5
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
.L15:
/* Remainder: the last N % 4 complex elements, lane 1 only (offset 0). */
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
addi.d I, I, -1
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
.L998:
/* Tournament reduction of the four lanes; strict less-than keeps the
   lowest-index winner on ties. */
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
.L999:
/* Return the index in $r4 (x1 == $r17); $f0 gets the max magnitude. */
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/izamin.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* izamin kernel (LoongArch64): returns in $r4 the 1-based index of the
 * first element of the complex vector X with the smallest |Re| + |Im|.
 * N ($r4) = element count, X ($r5) = base address, INCX ($r6) = stride in
 * complex elements (scaled below by ZBASE_SHIFT).  Returns 0 when N <= 0
 * or INCX <= 0.  The main loop is unrolled 4x: s1..s4 hold four running
 * minima and x1..x4 the matching indices; lanes are merged at .L998.
 * NOTE(review): the minimum value is also copied to $f0 on return
 * (fmov.d $f0, $f22) -- presumably shared boilerplate with the
 * value-returning variant; confirm callers consume only $r4.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r18
#define TEMP $r7
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define x1 $r17
#define x2 $r8
#define x3 $r9
#define x4 $r10
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Default result index is 0 for the early-exit paths. */
li x1, 0
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four lane minima; its index (1) seeds x1..x4. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
ADD s2, t1, t2
ADD s3, t1, t2
ADD s4, t1, t2
addi.d N, N, -1
li x1, 1
bge $r0, N, .L999
add.d X, X, INCX
li x2, 1
srai.d I, N, 2
li x3, 1
/* TEMP tracks the index of the next group of elements to be examined. */
li TEMP, 2
li x4, 1
bge $r0, I, .L15
/* Preload the first group of four complex elements. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main 4x-unrolled loop: loads for the next group are interleaved with
 * the |Re|+|Im| reduction of the current one.  When lane k finds a new
 * minimum, MOVT latches TEMP (the group's base index) into xk; the fixed
 * lane offset is added once after the loop, at .L13. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t3, s2
LD a8, X, 1 * SIZE
CMPLT $fcc2, t5, s3
add.d X, X, INCX
CMPLT $fcc3, t7, s4
addi.d I, I, -1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
blt $r0, I, .L12
.align 3
/* Drain the final preloaded group, then convert the group-base indices
 * latched in lanes 2..4 into absolute element indices (+1, +2, +3). */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t3, s2
CMPLT $fcc2, t5, s3
CMPLT $fcc3, t7, s4
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
CMOVT s2, s2, t3, $fcc1
MOVT(x2, TEMP, $fcc1)
CMOVT s3, s3, t5, $fcc2
MOVT(x3, TEMP, $fcc2)
CMOVT s4, s4, t7, $fcc3
MOVT(x4, TEMP, $fcc3)
addi.d TEMP, TEMP, 4
addi.d x2, x2, 1
addi.d x3, x3, 2
addi.d x4, x4, 3
.align 3
/* Scalar remainder: N mod 4 elements, accumulated into lane 1 only. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
addi.d I, I, -1
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
MOVT(x1, TEMP, $fcc0)
addi.d TEMP, TEMP, 1
blt $r0, I, .L16
.align 3
/* Merge the four lane minima (and their indices) into s1/x1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
MOVT(x1, x2, $fcc0)
CMOVT s3, s3, s4, $fcc1
MOVT(x3, x4, $fcc1)
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
MOVT(x1, x3, $fcc0)
.align 3
/* Return: index in $r4 (x1 is $r17); minimum value left in $f0. */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

174
kernel/loongarch64/max.S Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* max kernel (LoongArch64): returns in $f0 the largest element of vector
 * X (signed value comparison -- no absolute value is taken; there is no
 * FABS in this file).  N ($r4) = element count, X ($r5) = base address,
 * INCX ($r6) = stride in elements.  Returns 0.0 (MTC s1, $r0) when
 * N <= 0 or INCX <= 0.  The loop is unrolled 8x with four accumulators
 * s1..s4 that are merged at .L998.  TEMP is defined but unused here.
 * NOTE(review): "move $r4, $r17" before return copies the loop counter
 * into $r4 -- presumably boilerplate shared with the index-returning
 * kernels; callers of a value-returning MAX should read only $f0.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four accumulators. */
LD s1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
MOV s2, s1
bge $r0, N, .L999
MOV s3, s1
srai.d I, N, 3
MOV s4, s1
bge $r0, I, .L15
/* Software-pipelined prologue: preload a1..a6; a7/a8 are fetched inside
 * the loop body. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main 8x-unrolled loop: CMOVT keeps the larger of (sk, candidate). */
.L12:
CMPLT $fcc0, s1, a1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, a2
add.d X, X, INCX
CMPLT $fcc2, s3, a3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, a4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
LD a1, X, 0 * SIZE
CMOVT s2, s2, a2, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a3, $fcc2
LD a2, X, 0 * SIZE
CMOVT s4, s4, a4, $fcc3
add.d X, X, INCX
CMPLT $fcc0, s1, a5
LD a3, X, 0 * SIZE
CMPLT $fcc1, s2, a6
add.d X, X, INCX
CMPLT $fcc2, s3, a7
LD a4, X, 0 * SIZE
CMPLT $fcc3, s4, a8
add.d X, X, INCX
CMOVT s1, s1, a5, $fcc0
LD a5, X, 0 * SIZE
CMOVT s2, s2, a6, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a7, $fcc2
LD a6, X, 0 * SIZE
CMOVT s4, s4, a8, $fcc3
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L12
.align 3
/* Drain the last pipelined group (fetches the trailing a7/a8 first). */
.L13:
CMPLT $fcc0, s1, a1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, a2
add.d X, X, INCX
CMPLT $fcc2, s3, a3
LD a8, X, 0 * SIZE
CMPLT $fcc3, s4, a4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
CMOVT s2, s2, a2, $fcc1
CMOVT s3, s3, a3, $fcc2
CMOVT s4, s4, a4, $fcc3
CMPLT $fcc0, s1, a5
CMPLT $fcc1, s2, a6
CMPLT $fcc2, s3, a7
CMPLT $fcc3, s4, a8
CMOVT s1, s1, a5, $fcc0
CMOVT s2, s2, a6, $fcc1
CMOVT s3, s3, a7, $fcc2
CMOVT s4, s4, a8, $fcc3
.align 3
/* Scalar remainder: N mod 8 elements into s1 only. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
CMPLT $fcc0, s1, a1
CMOVT s1, s1, a1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four accumulators into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/* Return: maximum in $f0 (s1 is $f22). */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

174
kernel/loongarch64/min.S Normal file
View File

@ -0,0 +1,174 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* min kernel (LoongArch64): returns in $f0 the smallest element of vector
 * X (signed value comparison, no absolute value -- no FABS in this file).
 * Mirror image of max.S: only the CMPLT operand order differs (candidate
 * compared below the accumulator).  N ($r4) = element count, X ($r5) =
 * base address, INCX ($r6) = stride in elements.  Returns 0.0 when
 * N <= 0 or INCX <= 0.  8x unrolled with four accumulators s1..s4,
 * merged at .L998.  TEMP is defined but unused here.
 * NOTE(review): "move $r4, $r17" before return is presumably boilerplate
 * shared with the index-returning kernels; callers should read only $f0.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four accumulators. */
LD s1, X, 0 * SIZE
addi.d N, N, -1
add.d X, X, INCX
MOV s2, s1
bge $r0, N, .L999
MOV s3, s1
srai.d I, N, 3
MOV s4, s1
bge $r0, I, .L15
/* Software-pipelined prologue: preload a1..a6; a7/a8 come from the loop. */
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main 8x-unrolled loop: CMOVT keeps the smaller of (sk, candidate). */
.L12:
CMPLT $fcc0, a1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, a2, s2
add.d X, X, INCX
CMPLT $fcc2, a3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, a4, s4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
LD a1, X, 0 * SIZE
CMOVT s2, s2, a2, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a3, $fcc2
LD a2, X, 0 * SIZE
CMOVT s4, s4, a4, $fcc3
add.d X, X, INCX
CMPLT $fcc0, a5, s1
LD a3, X, 0 * SIZE
CMPLT $fcc1, a6, s2
add.d X, X, INCX
CMPLT $fcc2, a7, s3
LD a4, X, 0 * SIZE
CMPLT $fcc3, a8, s4
add.d X, X, INCX
CMOVT s1, s1, a5, $fcc0
LD a5, X, 0 * SIZE
CMOVT s2, s2, a6, $fcc1
add.d X, X, INCX
CMOVT s3, s3, a7, $fcc2
LD a6, X, 0 * SIZE
CMOVT s4, s4, a8, $fcc3
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L12
.align 3
/* Drain the last pipelined group (fetches the trailing a7/a8 first). */
.L13:
CMPLT $fcc0, a1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, a2, s2
add.d X, X, INCX
CMPLT $fcc2, a3, s3
LD a8, X, 0 * SIZE
CMPLT $fcc3, a4, s4
add.d X, X, INCX
CMOVT s1, s1, a1, $fcc0
CMOVT s2, s2, a2, $fcc1
CMOVT s3, s3, a3, $fcc2
CMOVT s4, s4, a4, $fcc3
CMPLT $fcc0, a5, s1
CMPLT $fcc1, a6, s2
CMPLT $fcc2, a7, s3
CMPLT $fcc3, a8, s4
CMOVT s1, s1, a5, $fcc0
CMOVT s2, s2, a6, $fcc1
CMOVT s3, s3, a7, $fcc2
CMOVT s4, s4, a8, $fcc3
.align 3
/* Scalar remainder: N mod 8 elements into s1 only. */
.L15:
andi I, N, 7
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
CMPLT $fcc0, a1, s1
CMOVT s1, s1, a1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four accumulators into s1. */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/* Return: minimum in $f0 (s1 is $f22). */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/scal.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* scal kernel (LoongArch64): X := ALPHA * X, in place.
 * N ($r4) = element count, ALPHA in $f0, X ($r7) = base address,
 * INCX ($r8) = stride in elements.  NOTE(review): X/INCX arrive in
 * $r7/$r8 rather than $r5/$r6 -- presumably the generic kernel interface
 * passes extra leading (dummy) arguments before the vector; confirm
 * against the kernel prototype.
 * Four code paths:
 *   .L12/.L16  ALPHA == 0, unit stride:  store zeros contiguously
 *   .L22/.L26  ALPHA == 0, general stride
 *   .L52/.L56  ALPHA != 0, unit stride:  8x-unrolled multiply
 *   .L62/.L66  ALPHA != 0, general stride (XX trails X as write pointer)
 * NOTE(review): the ALPHA == 0 shortcut stores zeros without reading X,
 * so NaN/Inf already in X are not propagated -- confirm this matches the
 * intended scal semantics for this build.
 * NOTE(review): "move $r4, $r17 / fmov.d $f0, $f22" before each return
 * looks like shared boilerplate; the routine's effect is the store to X.
 */
#define N $r4
#define X $r7
#define INCX $r8
#define I $r17
#define TEMP $r18
#define XX $r5
#define ALPHA $f0
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define t1 $f14
#define t2 $f15
#define t3 $f16
#define t4 $f17
PROLOGUE
li TEMP, SIZE
/* a1 := 0.0, used both to test ALPHA == 0 and as the store value. */
MTC a1, $r0
slli.d INCX, INCX, BASE_SHIFT
bge $r0, N, .L999
CMPEQ $fcc0, ALPHA, a1
bceqz $fcc0, .L50
/* ---- ALPHA == 0 paths ---- */
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L15
.align 3
/* Unit stride: store 8 zeros per iteration. */
.L12:
ST a1, X, 0 * SIZE
ST a1, X, 1 * SIZE
ST a1, X, 2 * SIZE
ST a1, X, 3 * SIZE
ST a1, X, 4 * SIZE
ST a1, X, 5 * SIZE
ST a1, X, 6 * SIZE
ST a1, X, 7 * SIZE
addi.w I, I, -1
addi.d X, X, 8 * SIZE
blt $r0, I, .L12
.align 3
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
ST a1, X, 0 * SIZE
addi.d I, I, -1
addi.d X, X, SIZE
blt $r0, I, .L16
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* General stride: store one zero per step. */
.L20:
srai.d I, N, 3
bge $r0, I, .L25
.align 3
.L22:
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
add.d X, X, INCX
ST a1, X, 0 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
addi.d I, I, -1
ST a1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L26
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* ---- ALPHA != 0 paths ---- */
.L50:
srai.d I, N, 3
bne INCX, TEMP, .L60
/* Unit stride, software-pipelined: preload 8 elements, then each
 * iteration multiplies the current group while loading the next. */
addi.d I, I, -1
blt I, $r0, .L55
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
bge $r0, I, .L53
.align 3
.L52:
MUL t1, ALPHA, a1
LD a1, X, 8 * SIZE
MUL t2, ALPHA, a2
LD a2, X, 9 * SIZE
MUL t3, ALPHA, a3
LD a3, X, 10 * SIZE
MUL t4, ALPHA, a4
LD a4, X, 11 * SIZE
ST t1, X, 0 * SIZE
MUL t1, ALPHA, a5
LD a5, X, 12 * SIZE
ST t2, X, 1 * SIZE
MUL t2, ALPHA, a6
LD a6, X, 13 * SIZE
ST t3, X, 2 * SIZE
MUL t3, ALPHA, a7
LD a7, X, 14 * SIZE
ST t4, X, 3 * SIZE
MUL t4, ALPHA, a8
LD a8, X, 15 * SIZE
addi.d I, I, -1
ST t1, X, 4 * SIZE
ST t2, X, 5 * SIZE
ST t3, X, 6 * SIZE
ST t4, X, 7 * SIZE
addi.d X, X, 8 * SIZE
blt $r0, I, .L52
.align 3
/* Drain the last preloaded group of 8. */
.L53:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
ST t1, X, 0 * SIZE
MUL t1, ALPHA, a5
ST t2, X, 1 * SIZE
MUL t2, ALPHA, a6
ST t3, X, 2 * SIZE
MUL t3, ALPHA, a7
ST t4, X, 3 * SIZE
MUL t4, ALPHA, a8
ST t1, X, 4 * SIZE
ST t2, X, 5 * SIZE
ST t3, X, 6 * SIZE
ST t4, X, 7 * SIZE
addi.d X, X, 8 * SIZE
.align 3
.L55:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L56:
LD a1, X, 0 * SIZE
MUL t1, ALPHA, a1
addi.d X, X, SIZE
addi.d I, I, -1
ST t1, X, -1 * SIZE
blt $r0, I, .L56
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
.align 3
/* General stride: XX is a trailing write pointer so loads stay ahead
 * of the stores. */
.L60:
srai.d I, N, 3
move XX, X
addi.d I, I, -1
blt I, $r0, .L65
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
add.d X, X, INCX
bge $r0, I, .L63
.align 3
.L62:
MUL t1, ALPHA, a1
LD a1, X, 0 * SIZE
add.d X, X, INCX
MUL t2, ALPHA, a2
LD a2, X, 0 * SIZE
add.d X, X, INCX
MUL t3, ALPHA, a3
LD a3, X, 0 * SIZE
add.d X, X, INCX
MUL t4, ALPHA, a4
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
MUL t1, ALPHA, a5
LD a5, X, 0 * SIZE
add.d X, X, INCX
MUL t2, ALPHA, a6
LD a6, X, 0 * SIZE
add.d X, X, INCX
MUL t3, ALPHA, a7
LD a7, X, 0 * SIZE
add.d X, X, INCX
MUL t4, ALPHA, a8
LD a8, X, 0 * SIZE
add.d X, X, INCX
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
addi.d I, I, -1
add.d XX, XX, INCX
blt $r0, I, .L62
.align 3
/* Drain the last preloaded group of 8 (strided). */
.L63:
MUL t1, ALPHA, a1
MUL t2, ALPHA, a2
MUL t3, ALPHA, a3
MUL t4, ALPHA, a4
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
MUL t1, ALPHA, a5
MUL t2, ALPHA, a6
MUL t3, ALPHA, a7
MUL t4, ALPHA, a8
ST t1, XX, 0 * SIZE
add.d XX, XX, INCX
ST t2, XX, 0 * SIZE
add.d XX, XX, INCX
ST t3, XX, 0 * SIZE
add.d XX, XX, INCX
ST t4, XX, 0 * SIZE
add.d XX, XX, INCX
.align 3
.L65:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L66:
LD a1, X, 0 * SIZE
MUL t1, ALPHA, a1
addi.d I, I, -1
ST t1, X, 0 * SIZE
add.d X, X, INCX
blt $r0, I, .L66
.align 3
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

249
kernel/loongarch64/snrm2.S Normal file
View File

@ -0,0 +1,249 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* snrm2 kernel (LoongArch64): returns in $f0 the Euclidean norm
 * sqrt(sum x_i^2) of single-precision vector X.  N ($r4) = element
 * count, X ($r5) = base, INCX ($r6) = stride in elements.  Each element
 * is widened to double (fcvt.d.s) and accumulated with fmadd.d into two
 * double accumulators s1/s2, so no input scaling is performed -- the
 * double range absorbs the squares of any finite float.  The final
 * result is fsqrt.d'ed and narrowed back to single (fcvt.s.d).
 * Returns 0.0 when N <= 0 or INCX <= 0.  8x unrolled, software
 * pipelined.  TEMP holds SIZE to detect the unit-stride fast path.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define a5 $f16
#define a6 $f17
#define a7 $f0
#define a8 $f1
#define s1 $f22
#define s2 $f8
#define t1 $f23
#define t2 $f9
#define t3 $f10
#define t4 $f11
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* Zero both double accumulators. */
movgr2fr.d s1, $r0
li TEMP, SIZE
fmov.d s2, s1
bge $r0, N, .L999
slli.d INCX, INCX, BASE_SHIFT
bge $r0, INCX, .L999
srai.d I, N, 3
bne INCX, TEMP, .L20
bge $r0, I, .L15
/* Unit-stride pipeline prologue: load 8 floats, widen the first 4. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
LD a6, X, 5 * SIZE
fcvt.d.s t2, a2
LD a7, X, 6 * SIZE
fcvt.d.s t3, a3
LD a8, X, 7 * SIZE
fcvt.d.s t4, a4
bge $r0, I, .L13
.align 3
/* Main loop: accumulate t*t while loading/widening the next group.
 * (X is advanced mid-iteration, so the final LD a8 at offset 7 refers
 * to the new base -- i.e. the old offset 15.) */
.L12:
fmadd.d s1, t1, t1, s1
LD a1, X, 8 * SIZE
fcvt.d.s t1, a5
NOP
fmadd.d s2, t2, t2, s2
LD a2, X, 9 * SIZE
fcvt.d.s t2, a6
NOP
fmadd.d s1, t3, t3, s1
LD a3, X, 10 * SIZE
fcvt.d.s t3, a7
NOP
fmadd.d s2, t4, t4, s2
LD a4, X, 11 * SIZE
fcvt.d.s t4, a8
NOP
fmadd.d s1, t1, t1, s1
LD a5, X, 12 * SIZE
fcvt.d.s t1, a1
NOP
fmadd.d s2, t2, t2, s2
LD a6, X, 13 * SIZE
fcvt.d.s t2, a2
addi.d I, I, -1
fmadd.d s1, t3, t3, s1
LD a7, X, 14 * SIZE
fcvt.d.s t3, a3
addi.d X, X, 8 * SIZE
fmadd.d s2, t4, t4, s2
LD a8, X, 7 * SIZE
fcvt.d.s t4, a4
blt $r0, I, .L12
.align 3
/* Drain the final 8 pipelined values. */
.L13:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
addi.d X, X, 8 * SIZE
.align 3
/* Scalar remainder (unit stride). */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fmadd.d s1, t1, t1, s1
addi.d X, X, SIZE
blt $r0, I, .L16
b .L999
.align 3
/* General-stride path, same pipeline structure. */
.L20:
bge $r0, I, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD a8, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
fcvt.d.s t2, a2
fcvt.d.s t3, a3
fcvt.d.s t4, a4
add.d X, X, INCX
bge $r0, I, .L24
.align 3
.L23:
fmadd.d s1, t1, t1, s1
LD a1, X, 0 * SIZE
fcvt.d.s t1, a5
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
LD a2, X, 0 * SIZE
fcvt.d.s t2, a6
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a3, X, 0 * SIZE
fcvt.d.s t3, a7
add.d X, X, INCX
fmadd.d s2, t4, t4, s2
LD a4, X, 0 * SIZE
fcvt.d.s t4, a8
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
LD a5, X, 0 * SIZE
fcvt.d.s t1, a1
add.d X, X, INCX
fmadd.d s2, t2, t2, s2
LD a6, X, 0 * SIZE
fcvt.d.s t2, a2
add.d X, X, INCX
fmadd.d s1, t3, t3, s1
LD a7, X, 0 * SIZE
fcvt.d.s t3, a3
add.d X, X, INCX
fmadd.d s2, t4, t4, s2
LD a8, X, 0 * SIZE
fcvt.d.s t4, a4
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L23
.align 3
/* Drain the final 8 pipelined values (strided). */
.L24:
fmadd.d s1, t1, t1, s1
fcvt.d.s t1, a5
fmadd.d s2, t2, t2, s2
fcvt.d.s t2, a6
fmadd.d s1, t3, t3, s1
fcvt.d.s t3, a7
fmadd.d s2, t4, t4, s2
fcvt.d.s t4, a8
fmadd.d s1, t1, t1, s1
fmadd.d s2, t2, t2, s2
fmadd.d s1, t3, t3, s1
fmadd.d s2, t4, t4, s2
.align 3
/* Scalar remainder (strided). */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
addi.d I, I, -1
fcvt.d.s t1, a1
add.d X, X, INCX
fmadd.d s1, t1, t1, s1
blt $r0, I, .L26
.align 3
/* Combine accumulators, take the square root, narrow to single in $f0. */
.L999:
fadd.d s1, s1, s2
fsqrt.d s1, s1
move $r4, $r17
fcvt.s.d $f0, s1
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/swap.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* swap kernel (LoongArch64): exchanges the contents of vectors X and Y.
 * N ($r4) = element count, X ($r7)/INCX ($r8) and Y ($r9)/INCY ($r10) =
 * bases and strides in elements.  NOTE(review): X starts at $r7, not $r5
 * -- presumably the generic kernel interface passes dummy leading
 * arguments; confirm against the kernel prototype.
 * Fast path (.L12): both strides unit, 8x unrolled and software
 * pipelined.  General path (.L22): XX/YY are trailing write pointers so
 * all loads for a group complete before the swapped stores.
 * NOTE(review): "move $r4, $r17 / fmov.d $f0, $f22" before return looks
 * like shared boilerplate; the routine's effect is the stores to X/Y.
 */
#define N $r4
#define X $r7
#define INCX $r8
#define Y $r9
#define INCY $r10
#define I $r17
#define TEMP $r18
#define XX $r5
#define YY $r6
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define b1 $f14
#define b2 $f15
#define b3 $f16
#define b4 $f17
#define b5 $f0
#define b6 $f1
#define b7 $f2
#define b8 $f3
PROLOGUE
li TEMP, SIZE
slli.d INCX, INCX, BASE_SHIFT
bge $r0, N, .L999
slli.d INCY, INCY, BASE_SHIFT
bne INCX, TEMP, .L20
srai.d I, N, 3
bne INCY, TEMP, .L20
/* Unit-stride pipeline prologue: preload 8 elements from each vector. */
addi.d I, I, -1
blt I, $r0, .L15
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
LD a2, X, 1 * SIZE
LD b2, Y, 1 * SIZE
LD a3, X, 2 * SIZE
LD b3, Y, 2 * SIZE
LD a4, X, 3 * SIZE
LD b4, Y, 3 * SIZE
LD a5, X, 4 * SIZE
LD b5, Y, 4 * SIZE
LD a6, X, 5 * SIZE
LD b6, Y, 5 * SIZE
LD a7, X, 6 * SIZE
LD b7, Y, 6 * SIZE
LD a8, X, 7 * SIZE
LD b8, Y, 7 * SIZE
bge $r0, I, .L13
.align 3
/* Main loop: store the held group crosswise while loading the next. */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST b1, X, 0 * SIZE
LD b1, Y, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST b2, X, 1 * SIZE
LD b2, Y, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST b3, X, 2 * SIZE
LD b3, Y, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST b4, X, 3 * SIZE
LD b4, Y, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST b5, X, 4 * SIZE
LD b5, Y, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST b6, X, 5 * SIZE
LD b6, Y, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST b7, X, 6 * SIZE
LD b7, Y, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
ST b8, X, 7 * SIZE
LD b8, Y, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* Drain: store the last preloaded group. */
.L13:
ST a1, Y, 0 * SIZE
ST b1, X, 0 * SIZE
ST a2, Y, 1 * SIZE
ST b2, X, 1 * SIZE
ST a3, Y, 2 * SIZE
ST b3, X, 2 * SIZE
ST a4, Y, 3 * SIZE
ST b4, X, 3 * SIZE
ST a5, Y, 4 * SIZE
ST b5, X, 4 * SIZE
ST a6, Y, 5 * SIZE
ST b6, X, 5 * SIZE
ST a7, Y, 6 * SIZE
ST b7, X, 6 * SIZE
ST a8, Y, 7 * SIZE
ST b8, X, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* Scalar remainder (unit stride). */
.L15:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
addi.d X, X, SIZE
addi.d I, I, -1
addi.d Y, Y, SIZE
ST b1, X, -1 * SIZE
ST a1, Y, -1 * SIZE
blt $r0, I, .L16
b .L999
.align 3
/* General-stride path: read pointers X/Y run ahead; XX/YY trail for
 * the swapped stores. */
.L20:
srai.d I, N, 3
move XX, X
move YY, Y
addi.d I, I, -1
blt I, $r0, .L25
LD a1, X, 0 * SIZE
add.d X, X, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
LD b2, Y, 0 * SIZE
add.d Y, Y, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
LD b3, Y, 0 * SIZE
add.d Y, Y, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
LD b4, Y, 0 * SIZE
add.d Y, Y, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
LD b5, Y, 0 * SIZE
add.d Y, Y, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
LD b6, Y, 0 * SIZE
add.d Y, Y, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
LD b7, Y, 0 * SIZE
add.d Y, Y, INCY
LD a8, X, 0 * SIZE
add.d X, X, INCX
LD b8, Y, 0 * SIZE
add.d Y, Y, INCY
bge $r0, I, .L23
.align 3
.L22:
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
LD a1, X, 0 * SIZE
add.d X, X, INCX
ST b1, XX, 0 * SIZE
add.d XX, XX, INCX
LD b1, Y, 0 * SIZE
add.d Y, Y, INCY
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
LD a2, X, 0 * SIZE
add.d X, X, INCX
ST b2, XX, 0 * SIZE
add.d XX, XX, INCX
LD b2, Y, 0 * SIZE
add.d Y, Y, INCY
ST a3, YY, 0 * SIZE
add.d YY, YY, INCY
LD a3, X, 0 * SIZE
add.d X, X, INCX
ST b3, XX, 0 * SIZE
add.d XX, XX, INCX
LD b3, Y, 0 * SIZE
add.d Y, Y, INCY
ST a4, YY, 0 * SIZE
add.d YY, YY, INCY
LD a4, X, 0 * SIZE
add.d X, X, INCX
ST b4, XX, 0 * SIZE
add.d XX, XX, INCX
LD b4, Y, 0 * SIZE
add.d Y, Y, INCY
ST a5, YY, 0 * SIZE
add.d YY, YY, INCY
LD a5, X, 0 * SIZE
add.d X, X, INCX
ST b5, XX, 0 * SIZE
add.d XX, XX, INCX
LD b5, Y, 0 * SIZE
add.d Y, Y, INCY
ST a6, YY, 0 * SIZE
add.d YY, YY, INCY
LD a6, X, 0 * SIZE
add.d X, X, INCX
ST b6, XX, 0 * SIZE
add.d XX, XX, INCX
LD b6, Y, 0 * SIZE
add.d Y, Y, INCY
ST a7, YY, 0 * SIZE
add.d YY, YY, INCY
LD a7, X, 0 * SIZE
add.d X, X, INCX
ST b7, XX, 0 * SIZE
add.d XX, XX, INCX
LD b7, Y, 0 * SIZE
add.d Y, Y, INCY
ST a8, YY, 0 * SIZE
add.d YY, YY, INCY
LD a8, X, 0 * SIZE
add.d X, X, INCX
ST b8, XX, 0 * SIZE
add.d XX, XX, INCX
LD b8, Y, 0 * SIZE
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L22
.align 3
/* Drain the last preloaded group (strided). */
.L23:
ST a1, YY, 0 * SIZE
add.d YY, YY, INCY
ST b1, XX, 0 * SIZE
add.d XX, XX, INCX
ST a2, YY, 0 * SIZE
add.d YY, YY, INCY
ST b2, XX, 0 * SIZE
add.d XX, XX, INCX
ST a3, YY, 0 * SIZE
add.d YY, YY, INCY
ST b3, XX, 0 * SIZE
add.d XX, XX, INCX
ST a4, YY, 0 * SIZE
add.d YY, YY, INCY
ST b4, XX, 0 * SIZE
add.d XX, XX, INCX
ST a5, YY, 0 * SIZE
add.d YY, YY, INCY
ST b5, XX, 0 * SIZE
add.d XX, XX, INCX
ST a6, YY, 0 * SIZE
add.d YY, YY, INCY
ST b6, XX, 0 * SIZE
add.d XX, XX, INCX
ST a7, YY, 0 * SIZE
add.d YY, YY, INCY
ST b7, XX, 0 * SIZE
add.d XX, XX, INCX
ST a8, YY, 0 * SIZE
add.d YY, YY, INCY
ST b8, XX, 0 * SIZE
add.d XX, XX, INCX
.align 3
/* Scalar remainder (strided). */
.L25:
andi I, N, 7
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST b1, X, 0 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

190
kernel/loongarch64/zamax.S Normal file
View File

@ -0,0 +1,190 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zamax kernel (LoongArch64): returns in $f0 the largest |Re| + |Im|
 * over the complex vector X (the BLAS "absolute value" for complex).
 * N ($r4) = element count, X ($r5) = base, INCX ($r6) = stride in
 * complex elements (scaled by ZBASE_SHIFT).  s1 is zero-seeded
 * (MTC s1, $r0), so 0.0 is returned when N <= 0 or INCX <= 0.
 * 4x unrolled with four accumulators s1..s4, merged at .L998.
 * No index is tracked -- value-only variant of izamax.
 * NOTE(review): "move $r4, $r17" before return copies the loop counter
 * into $r4 -- presumably boilerplate shared with the index-returning
 * kernels; callers should read only $f0.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
MTC s1, $r0
bge $r0, N, .L999
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* Element 1 seeds all four accumulators with its |Re| + |Im|. */
LD a1, X, 0 * SIZE
addi.d N, N, -1
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
bge $r0, N, .L999
ADD s2, t1, t2
srai.d I, N, 2
ADD s3, t1, t2
ADD s4, t1, t2
bge $r0, I, .L15
/* Preload the first group of four complex elements. */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* Main 4x-unrolled loop: |Re|+|Im| reduction of the current group is
 * interleaved with the loads of the next one. */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
LD a7, X, 0 * SIZE
CMPLT $fcc1, s2, t3
LD a8, X, 1 * SIZE
CMPLT $fcc2, s3, t5
add.d X, X, INCX
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
blt $r0, I, .L12
.align 3
/* Drain the final preloaded group. */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, s1, t1
CMPLT $fcc1, s2, t3
CMPLT $fcc2, s3, t5
CMPLT $fcc3, s4, t7
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
.align 3
/* Scalar remainder: N mod 4 elements into s1 only. */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
CMPLT $fcc0, s1, t1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* Merge the four accumulators into s1. */
.L998:
CMPLT $fcc0, s1, s2
CMPLT $fcc1, s3, s4
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s1, s3
CMOVT s1, s1, s3, $fcc0
.align 3
/* Return: maximum |Re|+|Im| in $f0 (s1 is $f22). */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

198
kernel/loongarch64/zamin.S Normal file
View File

@ -0,0 +1,198 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zamin kernel (LoongArch64): scan a double-complex vector X of length N
   with stride INCX and return min_i (|Re(x_i)| + |Im(x_i)|) in $f0.
   Mirrors zamax with the CMPLT operand order reversed (t < s => take t). */
/* integer register aliases */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
/* a1..a8: prefetched element values (software pipelining) */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
/* t1..t8: per-element |re|, |im| and their sums */
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
#define t5 $f4
#define t6 $f5
#define t7 $f6
#define t8 $f7
/* s1..s4: four partial minima, merged at .L998 */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes N and INCX by reference */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* s1 = 0.0 (result if N <= 0 or INCX <= 0) */
MTC s1, $r0
bge $r0, N, .L999
/* convert INCX to a complex byte stride */
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, INCX, .L999
/* seed all four accumulators with |re|+|im| of the first element */
LD a1, X, 0 * SIZE
addi.d N, N, -1
LD a2, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
ADD s1, t1, t2
bge $r0, N, .L999
NOP
ADD s2, t1, t2
/* I = remaining_N / 4 (unroll factor) */
srai.d I, N, 2
ADD s3, t1, t2
ADD s4, t1, t2
bge $r0, I, .L15
/* preload the first unrolled group of 4 complex elements */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L13
.align 3
/* .L12: steady-state loop -- 4 elements per pass, next 4 prefetched */
.L12:
FABS t1, a1
LD a1, X, 0 * SIZE
FABS t2, a2
LD a2, X, 1 * SIZE
FABS t3, a3
add.d X, X, INCX
FABS t4, a4
NOP
FABS t5, a5
LD a3, X, 0 * SIZE
FABS t6, a6
LD a4, X, 1 * SIZE
FABS t7, a7
add.d X, X, INCX
FABS t8, a8
NOP
ADD t1, t1, t2
LD a5, X, 0 * SIZE
ADD t3, t3, t4
LD a6, X, 1 * SIZE
ADD t5, t5, t6
add.d X, X, INCX
ADD t7, t7, t8
NOP
/* s_k = min(s_k, t_k): take t_k when t_k < s_k */
CMPLT $fcc0, t1, s1
LD a7, X, 0 * SIZE
CMPLT $fcc1, t3, s2
LD a8, X, 1 * SIZE
CMPLT $fcc2, t5, s3
add.d X, X, INCX
CMPLT $fcc3, t7, s4
NOP
CMOVT s1, s1, t1, $fcc0
addi.d I, I, -1
CMOVT s2, s2, t3, $fcc1
NOP
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
blt $r0, I, .L12
NOP
.align 3
/* .L13: drain -- fold the last prefetched group */
.L13:
FABS t1, a1
FABS t2, a2
FABS t3, a3
FABS t4, a4
FABS t5, a5
FABS t6, a6
FABS t7, a7
FABS t8, a8
ADD t1, t1, t2
ADD t3, t3, t4
ADD t5, t5, t6
ADD t7, t7, t8
CMPLT $fcc0, t1, s1
CMPLT $fcc1, t3, s2
CMPLT $fcc2, t5, s3
CMPLT $fcc3, t7, s4
CMOVT s1, s1, t1, $fcc0
CMOVT s2, s2, t3, $fcc1
CMOVT s3, s3, t5, $fcc2
CMOVT s4, s4, t7, $fcc3
.align 3
/* .L15: handle the N % 4 leftover elements one at a time (into s1) */
.L15:
andi I, N, 3
bge $r0, I, .L998
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d I, I, -1
FABS t1, a1
FABS t2, a2
ADD t1, t1, t2
CMPLT $fcc0, t1, s1
CMOVT s1, s1, t1, $fcc0
add.d X, X, INCX
blt $r0, I, .L16
.align 3
/* .L998: reduce the four partial minima into s1 */
.L998:
CMPLT $fcc0, s2, s1
CMPLT $fcc1, s4, s3
CMOVT s1, s1, s2, $fcc0
CMOVT s3, s3, s4, $fcc1
CMPLT $fcc0, s3, s1
CMOVT s1, s1, s3, $fcc0
.align 3
/* .L999: return result in $f0 */
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
NOP
EPILOGUE

158
kernel/loongarch64/zasum.S Normal file
View File

@ -0,0 +1,158 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zasum kernel (LoongArch64): return sum_i (|Re(x_i)| + |Im(x_i)|)
   over a double-complex vector X of length N with stride INCX; result
   in $f0.  Two accumulators (s1 for real parts, s2 for imaginary parts)
   are summed at the end to shorten the dependency chain. */
#define N $r4
#define X $r5
#define INCX $r6
#define I $r17
#define TEMP $r18
/* a1..a8: prefetched element values */
#define a1 $f23
#define a2 $f9
#define a3 $f10
#define a4 $f11
#define a5 $f12
#define a6 $f13
#define a7 $f14
#define a8 $f15
/* t1..t4: |value| temporaries, rotated through the pipeline */
#define t1 $f16
#define t2 $f17
#define t3 $f0
#define t4 $f1
/* s1/s2: partial sums */
#define s1 $f22
#define s2 $f8
PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes N and INCX by reference */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
#endif
/* zero both accumulators; returns 0.0 when N <= 0 */
MTC s1, $r0
MTC s2, $r0
/* convert INCX to a complex byte stride; I = N / 4 */
slli.d INCX, INCX, ZBASE_SHIFT
srai.d I, N, 2
bge $r0, N, .L999
bge $r0, I, .L25
/* preload the first unrolled group and start the FABS pipeline */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
FABS t1, a1
FABS t2, a2
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
FABS t3, a3
FABS t4, a4
addi.d I, I, -1
add.d X, X, INCX
bge $r0, I, .L24
.align 3
/* .L23: steady-state loop -- accumulate 4 complex elements per pass
   while loading the next 4; ADD, FABS and LD stages are interleaved */
.L23:
ADD s1, s1, t1
LD a1, X, 0 * SIZE
FABS t1, a5
addi.d I, I, -1
ADD s2, s2, t2
LD a2, X, 1 * SIZE
FABS t2, a6
add.d X, X, INCX
ADD s1, s1, t3
LD a3, X, 0 * SIZE
FABS t3, a7
NOP
ADD s2, s2, t4
LD a4, X, 1 * SIZE
FABS t4, a8
add.d X, X, INCX
ADD s1, s1, t1
LD a5, X, 0 * SIZE
FABS t1, a1
NOP
ADD s2, s2, t2
LD a6, X, 1 * SIZE
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t3
LD a7, X, 0 * SIZE
FABS t3, a3
LD a8, X, 1 * SIZE
ADD s2, s2, t4
add.d X, X, INCX
FABS t4, a4
blt $r0, I, .L23
.align 3
/* .L24: drain the pipeline -- fold the in-flight t's and a5..a8 */
.L24:
ADD s1, s1, t1
FABS t1, a5
ADD s2, s2, t2
FABS t2, a6
ADD s1, s1, t3
FABS t3, a7
ADD s2, s2, t4
FABS t4, a8
ADD s1, s1, t1
ADD s2, s2, t2
ADD s1, s1, t3
ADD s2, s2, t4
.align 3
/* .L25: handle the N % 4 leftover elements one at a time */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
FABS t1, a1
addi.d I, I, -1
FABS t2, a2
add.d X, X, INCX
ADD s1, s1, t1
ADD s2, s2, t2
blt $r0, I, .L26
.align 3
/* .L999: combine the two partial sums and return in $f0 */
.L999:
ADD s1, s1, s2
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

217
kernel/loongarch64/zcopy.S Normal file
View File

@ -0,0 +1,217 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zcopy kernel (LoongArch64): copy N double-complex elements from X
   (stride INCX) to Y (stride INCY).  Fast path at .L12 handles the
   contiguous case (both strides == 1 complex element); .L20 onward is
   the generic strided path.  Both are unrolled by 4 elements. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* a1..a8: staging registers for 4 complex (8 scalar) values */
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes N, INCX, INCY by reference */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* TEMP = byte stride of one contiguous complex element */
li TEMP, 2 * SIZE
NOP
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, N, .L999
slli.d INCY, INCY, ZBASE_SHIFT
/* fall through to the contiguous fast path only when both strides
   equal one complex element */
bne INCX, TEMP, .L20
srai.d I, N, 2
bne INCY, TEMP, .L20
addi.d I, I, -1
blt I, $r0, .L15
/* preload first group of 4 contiguous complex elements */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD a3, X, 2 * SIZE
LD a4, X, 3 * SIZE
LD a5, X, 4 * SIZE
LD a6, X, 5 * SIZE
LD a7, X, 6 * SIZE
LD a8, X, 7 * SIZE
bge $r0, I, .L13
.align 3
/* .L12: contiguous steady state -- store current group while loading
   the next one */
.L12:
ST a1, Y, 0 * SIZE
LD a1, X, 8 * SIZE
ST a2, Y, 1 * SIZE
LD a2, X, 9 * SIZE
ST a3, Y, 2 * SIZE
LD a3, X, 10 * SIZE
ST a4, Y, 3 * SIZE
LD a4, X, 11 * SIZE
ST a5, Y, 4 * SIZE
LD a5, X, 12 * SIZE
ST a6, Y, 5 * SIZE
LD a6, X, 13 * SIZE
ST a7, Y, 6 * SIZE
LD a7, X, 14 * SIZE
ST a8, Y, 7 * SIZE
LD a8, X, 15 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L12
.align 3
/* .L13: store the last preloaded group */
.L13:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
ST a3, Y, 2 * SIZE
ST a4, Y, 3 * SIZE
ST a5, Y, 4 * SIZE
ST a6, Y, 5 * SIZE
ST a7, Y, 6 * SIZE
ST a8, Y, 7 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
.align 3
/* .L15: contiguous tail, N % 4 elements, then return in-line */
.L15:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L16:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
addi.d X, X, 2 * SIZE
addi.d Y, Y, 2 * SIZE
ST a1, Y, -2 * SIZE
addi.d I, I, -1
ST a2, Y, -1 * SIZE
blt $r0, I, .L16
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
NOP
.align 3
/* .L20: generic strided path, unrolled by 4 with prefetched group */
.L20:
srai.d I, N, 2
addi.d I, I, -1
blt I, $r0, .L25
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
LD a3, X, 0 * SIZE
LD a4, X, 1 * SIZE
add.d X, X, INCX
LD a5, X, 0 * SIZE
LD a6, X, 1 * SIZE
add.d X, X, INCX
LD a7, X, 0 * SIZE
LD a8, X, 1 * SIZE
add.d X, X, INCX
bge $r0, I, .L23
.align 3
/* .L22: strided steady state -- store current group, load next */
.L22:
ST a1, Y, 0 * SIZE
LD a1, X, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
LD a2, X, 1 * SIZE
add.d X, X, INCX
ST a3, Y, 0 * SIZE
LD a3, X, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
LD a4, X, 1 * SIZE
add.d X, X, INCX
ST a5, Y, 0 * SIZE
LD a5, X, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
LD a6, X, 1 * SIZE
add.d X, X, INCX
ST a7, Y, 0 * SIZE
LD a7, X, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
LD a8, X, 1 * SIZE
addi.d I, I, -1
add.d X, X, INCX
blt $r0, I, .L22
.align 3
/* .L23: store the last preloaded strided group */
.L23:
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
.align 3
/* .L25: strided tail, N % 4 elements */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L26
.align 3
.L999:
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

330
kernel/loongarch64/zdot.S Normal file
View File

@ -0,0 +1,330 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zdot kernel (LoongArch64): complex dot product of X and Y (length N,
   strides INCX/INCY).  Accumulates four real products:
     s1 += Re(y)*Re(x)   s2 += Re(y)*Im(x)
     s3 += Im(y)*Re(x)   s4 += Im(y)*Im(x)
   and combines them at .L999 as (s1 -/+ s4, s3 +/- s2) depending on
   CONJ, returning the result in ($f0, $f1).
   Fast path .L13 handles both vectors contiguous; .L23 is strided. */
#define N $r4
#define X $r5
#define INCX $r6
#define Y $r7
#define INCY $r8
#define I $r17
#define TEMP $r18
/* a1..a4: X values, b1..b4: Y values (two complex pairs in flight) */
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define b1 $f14
#define b2 $f15
#define b3 $f16
#define b4 $f17
/* s1..s4: the four product accumulators */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes N, INCX, INCY by reference */
LDINT N, 0(N)
LDINT INCX, 0(INCX)
LDINT INCY, 0(INCY)
#endif
/* zero all four accumulators */
MTC s1, $r0
MOV s2, s1
MOV s3, s2
MOV s4, s3
/* convert strides to complex byte strides; TEMP = contiguous stride */
slli.d INCX, INCX, ZBASE_SHIFT
li TEMP, 2 * SIZE
slli.d INCY, INCY, ZBASE_SHIFT
bge $r0, N, .L999
srai.d I, N, 2
/* take the strided path unless both vectors are contiguous */
bne INCX, TEMP, .L20
bne INCY, TEMP, .L20
bge $r0, I, .L15
/* preload first complex pair of each vector */
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
LD b2, Y, 1 * SIZE
bge $r0, I, .L14
.align 3
/* .L13: contiguous steady state -- 4 complex elements per pass,
   MADDs interleaved with the loads for the next pass */
.L13:
MADD s1, b1, a1, s1
LD a3, X, 2 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 3 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 2 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 3 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 4 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 5 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 4 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 5 * SIZE
MADD s1, b1, a1, s1
LD a3, X, 6 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 7 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 6 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 7 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 8 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 9 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 8 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 9 * SIZE
addi.d I, I, -1
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L13
.align 3
/* .L14: drain the contiguous pipeline (last unrolled group) */
.L14:
MADD s1, b1, a1, s1
LD a3, X, 2 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 3 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 2 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 3 * SIZE
MADD s1, b3, a3, s1
LD a1, X, 4 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 5 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 4 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 5 * SIZE
MADD s1, b1, a1, s1
LD a3, X, 6 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 7 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 6 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 7 * SIZE
MADD s1, b3, a3, s1
addi.d X, X, 8 * SIZE
MADD s2, b3, a4, s2
addi.d Y, Y, 8 * SIZE
MADD s3, b4, a3, s3
MADD s4, b4, a4, s4
.align 3
/* .L15: contiguous tail, N % 4 elements */
.L15:
andi I, N, 3
bge $r0, I, .L999
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
addi.d I, I, -1
LD b2, Y, 1 * SIZE
bge $r0, I, .L17
.align 3
.L16:
MADD s1, b1, a1, s1
addi.d I, I, -1
MADD s2, b1, a2, s2
LD b1, Y, 2 * SIZE
MADD s3, b2, a1, s3
LD a1, X, 2 * SIZE
MADD s4, b2, a2, s4
LD a2, X, 3 * SIZE
LD b2, Y, 3 * SIZE
addi.d X, X, 2 * SIZE
addi.d Y, Y, 2 * SIZE
blt $r0, I, .L16
.align 3
/* .L17: fold the last tail element */
.L17:
MADD s1, b1, a1, s1
MADD s2, b1, a2, s2
MADD s3, b2, a1, s3
MADD s4, b2, a2, s4
b .L999
.align 3
/* .L20: strided path.  With F_INTERFACE, negative strides start from
   the logical end of the vector (BLAS convention).
   NOTE(review): mult/mflo/dsub are MIPS mnemonics, not LoongArch --
   this F_INTERFACE branch looks un-ported; confirm it is never built. */
.L20:
#ifdef F_INTERFACE
bgez INCX, .L21
addi.d TEMP, N, -1
mult TEMP, INCX
mflo TEMP
dsub X, X, TEMP
.align 3
.L21:
bgez INCY, .L22
addi.d TEMP, N, -1
mult TEMP, INCY
mflo TEMP
dsub Y, Y, TEMP
.align 3
.L22:
#endif
bge $r0, I, .L25
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
LD b2, Y, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
add.d Y, Y, INCY
bge $r0, I, .L24
.align 3
/* .L23: strided steady state -- 4 complex elements per pass */
.L23:
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
addi.d I, I, -1
add.d Y, Y, INCY
blt $r0, I, .L23
.align 3
/* .L24: drain the strided pipeline (last unrolled group) */
.L24:
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b3, a3, s1
LD a1, X, 0 * SIZE
MADD s2, b3, a4, s2
LD a2, X, 1 * SIZE
MADD s3, b4, a3, s3
LD b1, Y, 0 * SIZE
MADD s4, b4, a4, s4
LD b2, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
MADD s1, b1, a1, s1
LD a3, X, 0 * SIZE
MADD s2, b1, a2, s2
LD a4, X, 1 * SIZE
MADD s3, b2, a1, s3
LD b3, Y, 0 * SIZE
MADD s4, b2, a2, s4
LD b4, Y, 1 * SIZE
MADD s1, b3, a3, s1
add.d X, X, INCX
MADD s2, b3, a4, s2
add.d Y, Y, INCY
MADD s3, b4, a3, s3
MADD s4, b4, a4, s4
.align 3
/* .L25: strided tail, N % 4 elements */
.L25:
andi I, N, 3
bge $r0, I, .L999
.align 3
.L26:
LD a1, X, 0 * SIZE
LD a2, X, 1 * SIZE
LD b1, Y, 0 * SIZE
LD b2, Y, 1 * SIZE
MADD s1, b1, a1, s1
MADD s2, b1, a2, s2
MADD s3, b2, a1, s3
MADD s4, b2, a2, s4
add.d X, X, INCX
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L26
.align 3
/* .L999: combine accumulators into the complex result ($f0 real,
   $f1 imaginary); CONJ selects dotu vs dotc sign conventions */
.L999:
#ifndef CONJ
SUB $f0, s1, s4
#else
ADD $f0, s1, s4
#endif
#ifndef CONJ
ADD $f1, s3, s2
#else
SUB $f1, s3, s2
#endif
jirl $r0, $r1, 0x0
EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,648 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/* zgemv_n kernel (LoongArch64): y := alpha*A*x + y for a double-complex
   M x N column-major matrix A (leading dimension LDA), processing two
   columns of A per outer iteration (.L11) with a one-column cleanup
   (.L20).  Non-unit INCY is handled by staging y in BUFFER (.L02/.L06
   gather, .L902/.L906 scatter).
   NOTE(review): argument registers follow the LP64 ABI plus two stack
   args (INCY, BUFFER) -- confirm against the interface/ directory. */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r17
/* YORIG: base of the (possibly buffered) y vector for each column pass */
#define YORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
/* AO1/AO2: pointers to the two current columns of A */
#define AO1 $r23
#define AO2 $r24
#define ALPHA_R $f0
#define ALPHA_I $f1
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
/* x1/x2, x3/x4: alpha-scaled Re/Im of the two current x entries */
#define x1 $f14
#define x2 $f15
#define x3 $f16
#define x4 $f17
#define y1 $f3
#define y2 $f4
#define y3 $f2
#define y4 $f5
#define t1 $f6
#define t2 $f7
#define t3 $f18
#define t4 $f19
#define t5 $f20
#define t6 $f21
#define t7 $f24
#define t8 $f25
/* MADD1..MADD4 select the sign pattern of the four partial products of
   a complex multiply for each CONJ/XCONJ combination */
#if !defined(CONJ) && !defined(XCONJ)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 NMSUB
#define MADD4 MADD
#endif
#if defined(CONJ) && !defined(XCONJ)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
#endif
#if !defined(CONJ) && defined(XCONJ)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 MADD
#define MADD4 MADD
#endif
#if defined(CONJ) && defined(XCONJ)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 NMSUB
#define MADD4 NMSUB
#endif
PROLOGUE
/* INCY and BUFFER arrive on the stack */
LDARG INCY, $sp, 0
LDARG BUFFER, $sp, 8
#ifndef __64BIT__
addi.d $sp, $sp, -64
#else
addi.d $sp, $sp, -32
#endif
/* save callee-saved registers used by the kernel */
SDARG $r23, $sp, 0
SDARG $r24, $sp, 8
fst.d $f24, $sp, 16
fst.d $f25, $sp, 24
#ifndef __64BIT__
fst.d $f18, $sp, 32
fst.d $f19, $sp, 40
fst.d $f20, $sp, 48
fst.d $f21, $sp, 56
#endif
/* convert strides to complex byte strides; bail out for empty shapes */
slli.d LDA, LDA, ZBASE_SHIFT
slli.d INCX, INCX, ZBASE_SHIFT
bge $r0, M, .L999
slli.d INCY, INCY, ZBASE_SHIFT
bge $r0, N, .L999
/* if y is contiguous, operate on it in place; otherwise gather it
   into BUFFER first */
li I, 2 * SIZE
move YORIG, Y
beq INCY, I, .L10
srai.d I, M, 2
move YORIG, BUFFER
move XX, Y
move YY, BUFFER
bge $r0, I, .L05
.align 3
/* .L02: gather y (stride INCY) into contiguous BUFFER, 4 at a time */
.L02:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
add.d XX, XX, INCY
LD a3, XX, 0 * SIZE
LD a4, XX, 1 * SIZE
add.d XX, XX, INCY
LD a5, XX, 0 * SIZE
LD a6, XX, 1 * SIZE
add.d XX, XX, INCY
LD a7, XX, 0 * SIZE
LD a8, XX, 1 * SIZE
add.d XX, XX, INCY
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
ST a1, YY, -8 * SIZE
ST a2, YY, -7 * SIZE
ST a3, YY, -6 * SIZE
ST a4, YY, -5 * SIZE
ST a5, YY, -4 * SIZE
ST a6, YY, -3 * SIZE
ST a7, YY, -2 * SIZE
ST a8, YY, -1 * SIZE
blt $r0, I, .L02
.align 3
/* .L05/.L06: gather tail, M % 4 elements */
.L05:
andi I, M, 3
bge $r0, I, .L10
.align 3
.L06:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
add.d XX, XX, INCY
addi.d I, I, -1
ST a1, YY, 0 * SIZE
ST a2, YY, 1 * SIZE
addi.d YY, YY, 2 * SIZE
blt $r0, I, .L06
.align 3
/* .L10: outer loop over column pairs, J = N / 2 */
.L10:
srai.d J, N, 1
bge $r0, J, .L20
.align 3
/* .L11: load x[j], x[j+1] and form (x1,x2) = alpha*x[j],
   (x3,x4) = alpha*x[j+1]; XCONJ conjugates x */
.L11:
LD x1, X, 0 * SIZE
LD x2, X, 1 * SIZE
add.d X, X, INCX
LD x3, X, 0 * SIZE
LD x4, X, 1 * SIZE
add.d X, X, INCX
MUL a1, ALPHA_R, x1
move AO1, A
MUL a2, ALPHA_I, x1
add.d AO2, A, LDA
MUL a3, ALPHA_R, x3
add.d A, AO2, LDA
MUL a4, ALPHA_I, x3
#ifndef XCONJ
NMSUB x1, x2, ALPHA_I, a1
MADD x2, x2, ALPHA_R, a2
NMSUB x3, x4, ALPHA_I, a3
MADD x4, x4, ALPHA_R, a4
#else
MADD x1, x2, ALPHA_I, a1
MSUB x2, x2, ALPHA_R, a2
MADD x3, x4, ALPHA_I, a3
MSUB x4, x4, ALPHA_R, a4
#endif
srai.d I, M, 2
move YY, YORIG
bge $r0, I, .L15
/* prologue of the 4-row software pipeline: load first rows of both
   columns and y, and start t1..t4 */
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a2, AO1, 1 * SIZE
LD y4, YY, 3 * SIZE
LD a4, AO1, 3 * SIZE
LD a5, AO2, 0 * SIZE
LD a6, AO2, 1 * SIZE
LD a7, AO2, 2 * SIZE
LD a8, AO2, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 4 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 5 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 6 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 6 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 5 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 7 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 7 * SIZE
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
LD a5, AO2, 4 * SIZE
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
LD a7, AO2, 6 * SIZE
MADD3 t1, a6, x4, t1
MADD4 t2, a6, x3, t2
LD a6, AO2, 5 * SIZE
MADD3 t3, a8, x4, t3
addi.d I, I, -1
MADD4 t4, a8, x3, t4
LD a8, AO2, 7 * SIZE
bge $r0, I, .L13
.align 3
/* .L12: steady state -- compute t5..t8 for rows 2..3 while storing
   t1..t4 for rows 0..1, then swap roles; 4 rows per iteration */
.L12:
MADD1 t5, a1, x1, y1
LD y1, YY, 8 * SIZE
MADD2 t6, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD1 t7, a3, x1, y3
LD y2, YY, 9 * SIZE
MADD2 t8, a3, x2, y4
LD a3, AO1, 10 * SIZE
MADD3 t5, a2, x2, t5
LD y3, YY, 10 * SIZE
MADD4 t6, a2, x1, t6
LD a2, AO1, 9 * SIZE
MADD3 t7, a4, x2, t7
LD y4, YY, 11 * SIZE
MADD4 t8, a4, x1, t8
LD a4, AO1, 11 * SIZE
MADD1 t5, a5, x3, t5
ST t1, YY, 0 * SIZE
MADD2 t6, a5, x4, t6
LD a5, AO2, 8 * SIZE
MADD1 t7, a7, x3, t7
ST t2, YY, 1 * SIZE
MADD2 t8, a7, x4, t8
LD a7, AO2, 10 * SIZE
MADD3 t5, a6, x4, t5
ST t3, YY, 2 * SIZE
MADD4 t6, a6, x3, t6
LD a6, AO2, 9 * SIZE
MADD3 t7, a8, x4, t7
ST t4, YY, 3 * SIZE
MADD4 t8, a8, x3, t8
LD a8, AO2, 11 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 12 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 12 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 13 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 14 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 14 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 13 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 15 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 15 * SIZE
MADD1 t1, a5, x3, t1
ST t5, YY, 4 * SIZE
MADD2 t2, a5, x4, t2
LD a5, AO2, 12 * SIZE
MADD1 t3, a7, x3, t3
ST t6, YY, 5 * SIZE
MADD2 t4, a7, x4, t4
LD a7, AO2, 14 * SIZE
MADD3 t1, a6, x4, t1
ST t7, YY, 6 * SIZE
MADD4 t2, a6, x3, t2
LD a6, AO2, 13 * SIZE
MADD3 t3, a8, x4, t3
ST t8, YY, 7 * SIZE
MADD4 t4, a8, x3, t4
LD a8, AO2, 15 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
addi.d AO2, AO2, 8 * SIZE
blt $r0, I, .L12
.align 3
/* .L13: epilogue of the pipeline -- store t1..t4 and finish the last
   two rows held in a1..a8/y1..y4 */
.L13:
ST t1, YY, 0 * SIZE
MADD1 t1, a1, x1, y1
ST t2, YY, 1 * SIZE
MADD2 t2, a1, x2, y2
ST t3, YY, 2 * SIZE
MADD1 t3, a3, x1, y3
ST t4, YY, 3 * SIZE
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
MADD3 t1, a6, x4, t1
addi.d AO1, AO1, 8 * SIZE
MADD4 t2, a6, x3, t2
addi.d AO2, AO2, 8 * SIZE
MADD3 t3, a8, x4, t3
addi.d YY, YY, 8 * SIZE
MADD4 t4, a8, x3, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* .L15: handle 2 leftover rows (M & 2) for the column pair */
.L15:
andi I, M, 2
bge $r0, I, .L16
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
LD a5, AO2, 0 * SIZE
MADD2 t2, a1, x2, y2
LD a6, AO2, 1 * SIZE
MADD1 t3, a3, x1, y3
LD a7, AO2, 2 * SIZE
MADD2 t4, a3, x2, y4
LD a8, AO2, 3 * SIZE
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD1 t3, a7, x3, t3
MADD2 t4, a7, x4, t4
MADD3 t1, a6, x4, t1
addi.d YY, YY, 4 * SIZE
MADD4 t2, a6, x3, t2
addi.d AO1, AO1, 4 * SIZE
MADD3 t3, a8, x4, t3
addi.d AO2, AO2, 4 * SIZE
MADD4 t4, a8, x3, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* .L16: handle the final leftover row (M & 1) for the column pair */
.L16:
andi I, M, 1
bge $r0, I, .L19
LD y1, YY, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO1, 1 * SIZE
MADD1 t1, a1, x1, y1
LD a5, AO2, 0 * SIZE
MADD2 t2, a1, x2, y2
LD a6, AO2, 1 * SIZE
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
MADD1 t1, a5, x3, t1
MADD2 t2, a5, x4, t2
MADD3 t1, a6, x4, t1
MADD4 t2, a6, x3, t2
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
.align 3
.L19:
addi.d J, J, -1
blt $r0, J, .L11
.align 3
/* .L20: single remaining column (N odd); same structure with one
   column of A and one scaled x value */
.L20:
andi J, N, 1
bge $r0, J, .L900
LD x1, X, 0 * SIZE
LD x2, X, 1 * SIZE
add.d X, X, INCX
MUL a1, ALPHA_R, x1
move AO1, A
MUL a2, ALPHA_I, x1
#ifndef XCONJ
NMSUB x1, x2, ALPHA_I, a1
MADD x2, x2, ALPHA_R, a2
#else
MADD x1, x2, ALPHA_I, a1
MSUB x2, x2, ALPHA_R, a2
#endif
srai.d I, M, 2
move YY, YORIG
bge $r0, I, .L25
LD y1, YY, 0 * SIZE
LD a1, AO1, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a2, AO1, 1 * SIZE
LD y4, YY, 3 * SIZE
LD a4, AO1, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 4 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 4 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 5 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 6 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 6 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 5 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 7 * SIZE
MADD4 t4, a4, x1, t4
addi.d I, I, -1
LD a4, AO1, 7 * SIZE
bge $r0, I, .L23
.align 3
/* .L22: single-column steady state, 4 rows per iteration */
.L22:
MADD1 t5, a1, x1, y1
LD y1, YY, 8 * SIZE
MADD2 t6, a1, x2, y2
LD a1, AO1, 8 * SIZE
MADD1 t7, a3, x1, y3
LD y2, YY, 9 * SIZE
MADD2 t8, a3, x2, y4
LD a3, AO1, 10 * SIZE
MADD3 t5, a2, x2, t5
LD y3, YY, 10 * SIZE
MADD4 t6, a2, x1, t6
LD a2, AO1, 9 * SIZE
MADD3 t7, a4, x2, t7
LD y4, YY, 11 * SIZE
MADD4 t8, a4, x1, t8
LD a4, AO1, 11 * SIZE
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
ST t3, YY, 2 * SIZE
ST t4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
LD y1, YY, 12 * SIZE
MADD2 t2, a1, x2, y2
LD a1, AO1, 12 * SIZE
MADD1 t3, a3, x1, y3
LD y2, YY, 13 * SIZE
MADD2 t4, a3, x2, y4
LD a3, AO1, 14 * SIZE
MADD3 t1, a2, x2, t1
LD y3, YY, 14 * SIZE
MADD4 t2, a2, x1, t2
LD a2, AO1, 13 * SIZE
MADD3 t3, a4, x2, t3
LD y4, YY, 15 * SIZE
MADD4 t4, a4, x1, t4
LD a4, AO1, 15 * SIZE
ST t5, YY, 4 * SIZE
ST t6, YY, 5 * SIZE
ST t7, YY, 6 * SIZE
ST t8, YY, 7 * SIZE
addi.d I, I, -1
addi.d YY, YY, 8 * SIZE
addi.d AO1, AO1, 8 * SIZE
blt $r0, I, .L22
.align 3
/* .L23: single-column pipeline epilogue */
.L23:
ST t1, YY, 0 * SIZE
MADD1 t1, a1, x1, y1
ST t2, YY, 1 * SIZE
MADD2 t2, a1, x2, y2
ST t3, YY, 2 * SIZE
MADD1 t3, a3, x1, y3
ST t4, YY, 3 * SIZE
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
addi.d AO1, AO1, 8 * SIZE
MADD4 t2, a2, x1, t2
addi.d YY, YY, 8 * SIZE
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* .L25: single-column leftover rows (M & 2) */
.L25:
andi I, M, 2
bge $r0, I, .L26
LD a1, AO1, 0 * SIZE
LD y1, YY, 0 * SIZE
LD a2, AO1, 1 * SIZE
LD y2, YY, 1 * SIZE
LD a3, AO1, 2 * SIZE
LD y3, YY, 2 * SIZE
LD a4, AO1, 3 * SIZE
LD y4, YY, 3 * SIZE
MADD1 t1, a1, x1, y1
MADD2 t2, a1, x2, y2
MADD1 t3, a3, x1, y3
MADD2 t4, a3, x2, y4
MADD3 t1, a2, x2, t1
addi.d YY, YY, 4 * SIZE
MADD4 t2, a2, x1, t2
addi.d AO1, AO1, 4 * SIZE
MADD3 t3, a4, x2, t3
MADD4 t4, a4, x1, t4
ST t1, YY, -4 * SIZE
ST t2, YY, -3 * SIZE
ST t3, YY, -2 * SIZE
ST t4, YY, -1 * SIZE
.align 3
/* .L26: single-column final leftover row (M & 1) */
.L26:
andi I, M, 1
bge $r0, I, .L900
LD y1, YY, 0 * SIZE
LD y2, YY, 1 * SIZE
LD a1, AO1, 0 * SIZE
LD a2, AO1, 1 * SIZE
MADD1 t1, a1, x1, y1
MADD2 t2, a1, x2, y2
MADD3 t1, a2, x2, t1
MADD4 t2, a2, x1, t2
ST t1, YY, 0 * SIZE
ST t2, YY, 1 * SIZE
.align 3
/* .L900: if y was buffered, scatter BUFFER back to Y with stride INCY */
.L900:
li YORIG, 2 * SIZE
srai.d I, M, 2
beq INCY, YORIG, .L999
move XX, BUFFER
bge $r0, I, .L905
.align 3
.L902:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
LD a3, XX, 2 * SIZE
LD a4, XX, 3 * SIZE
LD a5, XX, 4 * SIZE
LD a6, XX, 5 * SIZE
LD a7, XX, 6 * SIZE
LD a8, XX, 7 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
ST a3, Y, 0 * SIZE
ST a4, Y, 1 * SIZE
add.d Y, Y, INCY
ST a5, Y, 0 * SIZE
ST a6, Y, 1 * SIZE
add.d Y, Y, INCY
ST a7, Y, 0 * SIZE
ST a8, Y, 1 * SIZE
add.d Y, Y, INCY
addi.d XX, XX, 8 * SIZE
blt $r0, I, .L902
.align 3
/* .L905/.L906: scatter tail, M % 4 elements */
.L905:
andi I, M, 3
bge $r0, I, .L999
.align 3
.L906:
LD a1, XX, 0 * SIZE
LD a2, XX, 1 * SIZE
addi.d XX, XX, 2 * SIZE
addi.d I, I, -1
ST a1, Y, 0 * SIZE
ST a2, Y, 1 * SIZE
add.d Y, Y, INCY
blt $r0, I, .L906
.align 3
/* .L999: restore callee-saved registers and return */
.L999:
LDARG $r23, $sp, 0
LDARG $r24, $sp, 8
fld.d $f24, $sp, 16
fld.d $f25, $sp, 24
#ifndef __64BIT__
fld.d $f18, $sp, 32
fld.d $f19, $sp, 40
fld.d $f20, $sp, 48
fld.d $f21, $sp, 56
#endif
#ifdef __64BIT__
addi.d $sp, $sp, 32
#else
addi.d $sp, $sp, 64
#endif
move $r4, $r17
fmov.d $f0, $f22
jirl $r0, $r1, 0x0
EPILOGUE

View File

@ -0,0 +1,556 @@
/***************************************************************************
Copyright (c) 2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * zgemv_t kernel for LoongArch64 (double-precision complex).
 *
 * Computes, for each column j of A:  y[j] += alpha * (A(:,j)^T . x)
 * i.e. the "transposed" GEMV.  Columns are processed two at a time and
 * rows four complex elements at a time, with a software-pipelined inner
 * loop.  If x is not unit-stride it is first copied into the contiguous
 * BUFFER so the inner loops can use fixed offsets.
 *
 * Arguments (OpenBLAS zgemv kernel convention):
 *   M, N        matrix dimensions
 *   ALPHA_R/I   complex scaling factor (in $f0/$f1)
 *   A, LDA      column-major matrix and leading dimension (elements)
 *   X, INCX     input vector and stride (elements)
 *   Y, INCY     output vector and stride (elements) - INCY and BUFFER
 *               arrive on the caller's stack (9th/10th integer args).
 *
 * NOTE(review): register names and scheduling follow the MIPS kernels
 * this port is derived from.
 */
/* integer register assignments */
#define M $r4
#define N $r5
#define A $r7
#define LDA $r8
#define X $r9
#define INCX $r10
#define Y $r11
#define INCY $r6
#define BUFFER $r17
#define XORIG $r18
#define XX $r12
#define YY $r13
#define I $r14
#define J $r15
#define AO1 $r23
#define AO2 $r24
/* floating-point register assignments */
#define ALPHA_R $f0
#define ALPHA_I $f1
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define y1 $f14
#define y2 $f15
#define y3 $f16
#define y4 $f17
#define x1 $f3
#define x2 $f4
#define x3 $f2
#define x4 $f5
#define x5 $f6
#define x6 $f7
#define x7 $f18
#define x8 $f19
/*
 * Complex multiply-accumulate sign selection.  MADD1..MADD4 implement
 * (ar + i*ai) * (xr + i*xi) with the signs of the cross terms flipped
 * according to CONJ (conjugate A) and XCONJ (conjugate x).
 */
#if !defined(CONJ) && !defined(XCONJ)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 NMSUB
#define MADD4 MADD
#endif
#if defined(CONJ) && !defined(XCONJ)
#define MADD1 MADD
#define MADD2 MADD
#define MADD3 MADD
#define MADD4 NMSUB
#endif
#if !defined(CONJ) && defined(XCONJ)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 MADD
#define MADD4 MADD
#endif
#if defined(CONJ) && defined(XCONJ)
#define MADD1 MADD
#define MADD2 NMSUB
#define MADD3 NMSUB
#define MADD4 NMSUB
#endif
	PROLOGUE
/* INCY and BUFFER are the 9th/10th integer arguments, passed on the stack */
	LDARG INCY, $sp, 0
	LDARG BUFFER, $sp, 8
/* frame: 16 bytes (LP64) for $r23/$r24, 32 bytes otherwise to also save $f18/$f19 */
#ifdef __64BIT__
	addi.d $sp, $sp, -16
#else
	addi.d $sp, $sp, -32
#endif
	MTC y1, $r0
	SDARG $r23, $sp, 0
	SDARG $r24, $sp, 8
/* convert element strides to byte strides (complex element = 2*SIZE bytes) */
	slli.d LDA, LDA, ZBASE_SHIFT
#ifndef __64BIT__
/* NOTE(review): $f18/$f19 saved only on the 32-bit ABI - presumably
   callee-saved there; confirm against the ILP32 calling convention */
	fst.d $f18, $sp, 16
	fst.d $f19, $sp, 24
#endif
	slli.d INCX, INCX, ZBASE_SHIFT
/* nothing to do for empty problem sizes */
	bge $r0, M, .L999
	slli.d INCY, INCY, ZBASE_SHIFT
	bge $r0, N, .L999
/* if x already has unit stride (2*SIZE bytes) use it in place ... */
	li I, 2 * SIZE
	move XORIG, X
	beq INCX, I, .L10
/* ... otherwise copy x into the contiguous BUFFER, 4 complex elements per pass */
	srai.d I, M, 2
	move XORIG, BUFFER
	move YY, BUFFER
	bge $r0, I, .L05
	.align 3
.L02:
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	add.d X, X, INCX
	LD a3, X, 0 * SIZE
	LD a4, X, 1 * SIZE
	add.d X, X, INCX
	LD a5, X, 0 * SIZE
	LD a6, X, 1 * SIZE
	add.d X, X, INCX
	LD a7, X, 0 * SIZE
	LD a8, X, 1 * SIZE
	add.d X, X, INCX
	addi.d I, I, -1
	addi.d YY, YY, 8 * SIZE
	ST a1, YY, -8 * SIZE
	ST a2, YY, -7 * SIZE
	ST a3, YY, -6 * SIZE
	ST a4, YY, -5 * SIZE
	ST a5, YY, -4 * SIZE
	ST a6, YY, -3 * SIZE
	ST a7, YY, -2 * SIZE
	ST a8, YY, -1 * SIZE
	blt $r0, I, .L02
	.align 3
/* copy the remaining M % 4 complex elements of x */
.L05:
	andi I, M, 3
	bge $r0, I, .L10
	.align 3
.L06:
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	add.d X, X, INCX
	ST a1, YY, 0 * SIZE
	ST a2, YY, 1 * SIZE
	addi.d I, I, -1
	addi.d YY, YY, 2 * SIZE
	blt $r0, I, .L06
	.align 3
/* main column loop: two columns of A (AO1/AO2) per iteration of J */
.L10:
	srai.d J, N, 1
	move YY, Y
	bge $r0, J, .L20
	.align 3
.L11:
/* y1..y4 accumulate (re, im) of the two column dot products; y1 is zeroed
   at function entry and again at the end of .L19 for the next iteration */
	move AO1, A
	MOV y2, y1
	add.d AO2, A, LDA
	MOV y3, y1
	add.d A, AO2, LDA
	MOV y4, y1
	srai.d I, M, 2
	move XX, XORIG
	bge $r0, I, .L15
/* software-pipeline prologue: preload the first four rows */
	LD x1, XX, 0 * SIZE
	LD x2, XX, 1 * SIZE
	LD x4, XX, 3 * SIZE
	LD a1, AO1, 0 * SIZE
	LD a3, AO2, 0 * SIZE
	LD a2, AO1, 1 * SIZE
	LD a4, AO2, 1 * SIZE
	LD a5, AO1, 2 * SIZE
	LD a7, AO2, 2 * SIZE
	LD a6, AO1, 3 * SIZE
	LD a8, AO2, 3 * SIZE
	addi.d I, I, -1
	bge $r0, I, .L13
	.align 3
/* pipelined steady state: 4 complex rows x 2 columns per pass; the loads
   interleaved with the MADDs fetch the operands of the NEXT pass */
.L12:
	MADD1 y1, a1, x1, y1
	LD x3, XX, 2 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 4 * SIZE
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	LD a3, AO2, 4 * SIZE
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	LD a2, AO1, 5 * SIZE
	MADD3 y3, a4, x2, y3
	LD x2, XX, 5 * SIZE
	MADD4 y4, a4, x1, y4
	LD a4, AO2, 5 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 4 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 6 * SIZE
	MADD1 y3, a7, x3, y3
	MADD2 y4, a7, x4, y4
	LD a7, AO2, 6 * SIZE
	MADD3 y1, a6, x4, y1
	addi.d I, I, -1
	MADD4 y2, a6, x3, y2
	LD a6, AO1, 7 * SIZE
	MADD3 y3, a8, x4, y3
	LD x4, XX, 7 * SIZE
	MADD4 y4, a8, x3, y4
	LD a8, AO2, 7 * SIZE
	MADD1 y1, a1, x1, y1
	LD x3, XX, 6 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 8 * SIZE
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	LD a3, AO2, 8 * SIZE
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	LD a2, AO1, 9 * SIZE
	MADD3 y3, a4, x2, y3
	LD x2, XX, 9 * SIZE
	MADD4 y4, a4, x1, y4
	LD a4, AO2, 9 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 8 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 10 * SIZE
	MADD1 y3, a7, x3, y3
	addi.d XX, XX, 8 * SIZE
	MADD2 y4, a7, x4, y4
	LD a7, AO2, 10 * SIZE
	MADD3 y1, a6, x4, y1
	addi.d AO2, AO2, 8 * SIZE
	MADD4 y2, a6, x3, y2
	LD a6, AO1, 11 * SIZE
	MADD3 y3, a8, x4, y3
	LD x4, XX, 3 * SIZE
	MADD4 y4, a8, x3, y4
	LD a8, AO2, 3 * SIZE
	addi.d AO1, AO1, 8 * SIZE
	blt $r0, I, .L12
	.align 3
/* pipeline drain: consume the operands preloaded by the last .L12 pass */
.L13:
	MADD1 y1, a1, x1, y1
	LD x3, XX, 2 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 4 * SIZE
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	LD a3, AO2, 4 * SIZE
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	LD a2, AO1, 5 * SIZE
	MADD3 y3, a4, x2, y3
	LD x2, XX, 5 * SIZE
	MADD4 y4, a4, x1, y4
	LD a4, AO2, 5 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 4 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 6 * SIZE
	MADD1 y3, a7, x3, y3
	MADD2 y4, a7, x4, y4
	LD a7, AO2, 6 * SIZE
	MADD3 y1, a6, x4, y1
	MADD4 y2, a6, x3, y2
	LD a6, AO1, 7 * SIZE
	MADD3 y3, a8, x4, y3
	LD x4, XX, 7 * SIZE
	MADD4 y4, a8, x3, y4
	LD a8, AO2, 7 * SIZE
	MADD1 y1, a1, x1, y1
	LD x3, XX, 6 * SIZE
	MADD2 y2, a1, x2, y2
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	MADD3 y3, a4, x2, y3
	MADD4 y4, a4, x1, y4
	MADD1 y1, a5, x3, y1
	MADD2 y2, a5, x4, y2
	MADD1 y3, a7, x3, y3
	MADD2 y4, a7, x4, y4
	MADD3 y1, a6, x4, y1
	addi.d XX, XX, 8 * SIZE
	MADD4 y2, a6, x3, y2
	addi.d AO1, AO1, 8 * SIZE
	MADD3 y3, a8, x4, y3
	addi.d AO2, AO2, 8 * SIZE
	MADD4 y4, a8, x3, y4
	.align 3
/* handle 2 remaining rows when M % 4 >= 2 */
.L15:
	andi I, M, 2
	bge $r0, I, .L17
	LD x1, XX, 0 * SIZE
	LD x2, XX, 1 * SIZE
	LD x3, XX, 2 * SIZE
	LD x4, XX, 3 * SIZE
	LD a1, AO1, 0 * SIZE
	LD a3, AO2, 0 * SIZE
	LD a2, AO1, 1 * SIZE
	LD a4, AO2, 1 * SIZE
	LD a5, AO1, 2 * SIZE
	LD a7, AO2, 2 * SIZE
	LD a6, AO1, 3 * SIZE
	LD a8, AO2, 3 * SIZE
	MADD1 y1, a1, x1, y1
	MADD2 y2, a1, x2, y2
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	MADD3 y3, a4, x2, y3
	MADD4 y4, a4, x1, y4
	MADD1 y1, a5, x3, y1
	MADD2 y2, a5, x4, y2
	MADD1 y3, a7, x3, y3
	MADD2 y4, a7, x4, y4
	MADD3 y1, a6, x4, y1
	addi.d XX, XX, 4 * SIZE
	MADD4 y2, a6, x3, y2
	addi.d AO1, AO1, 4 * SIZE
	MADD3 y3, a8, x4, y3
	addi.d AO2, AO2, 4 * SIZE
	MADD4 y4, a8, x3, y4
	.align 3
/* handle the final row when M is odd */
.L17:
	andi I, M, 1
	.align 3
	bge $r0, I, .L19
.L18:
	LD x1, XX, 0 * SIZE
	LD x2, XX, 1 * SIZE
	LD a1, AO1, 0 * SIZE
	LD a3, AO2, 0 * SIZE
	MADD1 y1, a1, x1, y1
	LD a2, AO1, 1 * SIZE
	MADD2 y2, a1, x2, y2
	LD a4, AO2, 1 * SIZE
	MADD1 y3, a3, x1, y3
	MADD2 y4, a3, x2, y4
	MADD3 y1, a2, x2, y1
	MADD4 y2, a2, x1, y2
	MADD3 y3, a4, x2, y3
	MADD4 y4, a4, x1, y4
	.align 3
/* y[j] += alpha * (y1 + i*y2), y[j+1] += alpha * (y3 + i*y4);
   y1 is re-zeroed here for the next column pair */
.L19:
	LD a1, Y, 0 * SIZE
	LD a2, Y, 1 * SIZE
	add.d Y, Y, INCY
	LD a3, Y, 0 * SIZE
	LD a4, Y, 1 * SIZE
	add.d Y, Y, INCY
	MADD a1, y1, ALPHA_R, a1
	MADD a2, y1, ALPHA_I, a2
	MADD a3, y3, ALPHA_R, a3
	MADD a4, y3, ALPHA_I, a4
	NMSUB a1, y2, ALPHA_I, a1
	MADD a2, y2, ALPHA_R, a2
	NMSUB a3, y4, ALPHA_I, a3
	MTC y1, $r0
	MADD a4, y4, ALPHA_R, a4
	addi.d J, J, -1
	ST a1, YY, 0 * SIZE
	ST a2, YY, 1 * SIZE
	add.d YY, YY, INCY
	ST a3, YY, 0 * SIZE
	ST a4, YY, 1 * SIZE
	add.d YY, YY, INCY
	blt $r0, J, .L11
	.align 3
/* tail: process the last single column when N is odd (same structure as
   above but one column, with y1/y3 and y2/y4 summed at the end) */
.L20:
	andi J, N, 1
	MOV y2, y1
	srai.d I, M, 2
	bge $r0, J, .L999
	MOV y3, y1
	move AO1, A
	MOV y4, y1
	move XX, XORIG
	bge $r0, I, .L25
	LD a1, AO1, 0 * SIZE
	LD x1, XX, 0 * SIZE
	LD a2, AO1, 1 * SIZE
	LD x2, XX, 1 * SIZE
	LD a5, AO1, 2 * SIZE
	LD x4, XX, 3 * SIZE
	addi.d I, I, -1
	LD a6, AO1, 3 * SIZE
	bge $r0, I, .L23
	.align 3
/* pipelined single-column main loop: 4 complex rows per pass */
.L22:
	MADD1 y1, a1, x1, y1
	LD x3, XX, 2 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 4 * SIZE
	MADD3 y3, a2, x2, y3
	LD x2, XX, 5 * SIZE
	MADD4 y4, a2, x1, y4
	LD a2, AO1, 5 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 4 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 6 * SIZE
	MADD3 y3, a6, x4, y3
	LD x4, XX, 7 * SIZE
	MADD4 y4, a6, x3, y4
	LD a6, AO1, 7 * SIZE
	MADD1 y1, a1, x1, y1
	LD x3, XX, 6 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 8 * SIZE
	MADD3 y3, a2, x2, y3
	LD x2, XX, 9 * SIZE
	MADD4 y4, a2, x1, y4
	LD a2, AO1, 9 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 8 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 10 * SIZE
	MADD3 y3, a6, x4, y3
	LD x4, XX, 11 * SIZE
	MADD4 y4, a6, x3, y4
	LD a6, AO1, 11 * SIZE
	addi.d I, I, -1
	addi.d XX, XX, 8 * SIZE
	addi.d AO1, AO1, 8 * SIZE
	blt $r0, I, .L22
	.align 3
/* single-column pipeline drain */
.L23:
	MADD1 y1, a1, x1, y1
	LD x3, XX, 2 * SIZE
	MADD2 y2, a1, x2, y2
	LD a1, AO1, 4 * SIZE
	MADD3 y3, a2, x2, y3
	LD x2, XX, 5 * SIZE
	MADD4 y4, a2, x1, y4
	LD a2, AO1, 5 * SIZE
	MADD1 y1, a5, x3, y1
	LD x1, XX, 4 * SIZE
	MADD2 y2, a5, x4, y2
	LD a5, AO1, 6 * SIZE
	MADD3 y3, a6, x4, y3
	LD x4, XX, 7 * SIZE
	MADD4 y4, a6, x3, y4
	LD a6, AO1, 7 * SIZE
	MADD1 y1, a1, x1, y1
	LD x3, XX, 6 * SIZE
	MADD2 y2, a1, x2, y2
	MADD3 y3, a2, x2, y3
	MADD4 y4, a2, x1, y4
	MADD1 y1, a5, x3, y1
	MADD2 y2, a5, x4, y2
	MADD3 y3, a6, x4, y3
	addi.d XX, XX, 8 * SIZE
	MADD4 y4, a6, x3, y4
	addi.d AO1, AO1, 8 * SIZE
	.align 3
/* single column: 2 remaining rows */
.L25:
	andi I, M, 2
	bge $r0, I, .L27
	LD a1, AO1, 0 * SIZE
	LD x1, XX, 0 * SIZE
	LD a2, AO1, 1 * SIZE
	LD x2, XX, 1 * SIZE
	LD a5, AO1, 2 * SIZE
	MADD1 y1, a1, x1, y1
	LD x3, XX, 2 * SIZE
	MADD2 y2, a1, x2, y2
	LD a6, AO1, 3 * SIZE
	MADD3 y3, a2, x2, y3
	LD x4, XX, 3 * SIZE
	MADD4 y4, a2, x1, y4
	MADD1 y1, a5, x3, y1
	MADD2 y2, a5, x4, y2
	MADD3 y3, a6, x4, y3
	addi.d XX, XX, 4 * SIZE
	MADD4 y4, a6, x3, y4
	addi.d AO1, AO1, 4 * SIZE
	.align 3
/* single column: final row when M is odd */
.L27:
	andi I, M, 1
	.align 3
	bge $r0, I, .L29
.L28:
	LD a1, AO1, 0 * SIZE
	LD x1, XX, 0 * SIZE
	LD a2, AO1, 1 * SIZE
	LD x2, XX, 1 * SIZE
	MADD1 y1, a1, x1, y1
	MADD2 y2, a1, x2, y2
	MADD3 y3, a2, x2, y3
	MADD4 y4, a2, x1, y4
	.align 3
/* fold partial sums and update the last y element */
.L29:
	LD a1, Y, 0 * SIZE
	LD a2, Y, 1 * SIZE
	ADD y1, y1, y3
	ADD y2, y2, y4
	MADD a1, y1, ALPHA_R, a1
	MADD a2, y1, ALPHA_I, a2
	NMSUB a1, y2, ALPHA_I, a1
	MADD a2, y2, ALPHA_R, a2
	ST a1, YY, 0 * SIZE
	ST a2, YY, 1 * SIZE
	.align 3
/* epilogue: restore callee-saved registers and return */
.L999:
	LDARG $r23, $sp, 0
	LDARG $r24, $sp, 8
#ifndef __64BIT__
	fld.d $f18, $sp, 16
	fld.d $f19, $sp, 24
#endif
#ifdef __64BIT__
	addi.d $sp, $sp, 16
#else
	addi.d $sp, $sp, 32
#endif
/* NOTE(review): $r4/$f0 set here look like leftovers of the MIPS kernels'
   delay-slot scheduling; the values appear unused by callers - confirm */
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	EPILOGUE

304
kernel/loongarch64/znrm2.S Normal file
View File

@ -0,0 +1,304 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * znrm2 kernel for LoongArch64: Euclidean norm of a complex vector.
 *
 * Two-pass algorithm to avoid overflow/underflow:
 *   pass 1 - scan x and find max = max(|re|, |im|) over all components;
 *   pass 2 - accumulate sum((x/max)^2) in four partial sums, then
 *            return max * sqrt(sum).
 * Returns 0 immediately for N <= 0, INCX <= 0, or an all-zero vector.
 */
#define N $r4
#define X $r5
#define INCX $r6
#define XX $r7
#define I $r17
#define TEMP $r18
#define a1 $f10
#define a2 $f11
#define a3 $f12
#define a4 $f13
#define a5 $f14
#define a6 $f15
#define a7 $f16
#define a8 $f17
#define t1 $f0
#define t2 $f1
#define t3 $f2
#define t4 $f3
/* s1..s4: four independent partial maxima / partial sums */
#define s1 $f22
#define s2 $f8
#define s3 $f23
#define s4 $f9
#define ALPHA $f4
#define max $f5
	PROLOGUE
#ifdef F_INTERFACE
/* Fortran interface passes N and INCX by reference */
	LDINT N, 0(N)
	LDINT INCX, 0(INCX)
#endif
	MTC s1, $r0
	bge $r0, N, .L999
	slli.d INCX, INCX, ZBASE_SHIFT
	bge $r0, INCX, .L999
/* keep the start of x in XX for the second pass */
	move XX, X
	MOV s2, s1
	srai.d I, N, 2
	MOV s3, s1
	MOV s4, s1
	bge $r0, I, .L15
/* pass 1 prologue: preload four complex elements */
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	add.d X, X, INCX
	LD a3, X, 0 * SIZE
	LD a4, X, 1 * SIZE
	add.d X, X, INCX
	LD a5, X, 0 * SIZE
	LD a6, X, 1 * SIZE
	add.d X, X, INCX
	LD a7, X, 0 * SIZE
	LD a8, X, 1 * SIZE
	addi.d I, I, -1
	add.d X, X, INCX
	bge $r0, I, .L13
	.align 3
/* pass 1 main loop: track |value| maxima in s1..s4 while loading the
   next four complex elements (CMOVT keeps the larger operand) */
.L12:
	FABS t1, a1
	LD a1, X, 0 * SIZE
	FABS t2, a2
	NOP
	FABS t3, a3
	LD a2, X, 1 * SIZE
	FABS t4, a4
	add.d X, X, INCX
	CMPLT $fcc0, s1, t1
	LD a3, X, 0 * SIZE
	CMPLT $fcc1, s2, t2
	NOP
	CMPLT $fcc2, s3, t3
	LD a4, X, 1 * SIZE
	CMPLT $fcc3, s4, t4
	add.d X, X, INCX
	CMOVT s1, s1, t1, $fcc0
	CMOVT s2, s2, t2, $fcc1
	CMOVT s3, s3, t3, $fcc2
	CMOVT s4, s4, t4, $fcc3
	FABS t1, a5
	LD a5, X, 0 * SIZE
	FABS t2, a6
	NOP
	FABS t3, a7
	LD a6, X, 1 * SIZE
	FABS t4, a8
	add.d X, X, INCX
	CMPLT $fcc0, s1, t1
	LD a7, X, 0 * SIZE
	CMPLT $fcc1, s2, t2
	NOP
	CMPLT $fcc2, s3, t3
	LD a8, X, 1 * SIZE
	CMPLT $fcc3, s4, t4
	add.d X, X, INCX
	CMOVT s1, s1, t1, $fcc0
	addi.d I, I, -1
	CMOVT s2, s2, t2, $fcc1
	CMOVT s3, s3, t3, $fcc2
	CMOVT s4, s4, t4, $fcc3
	blt $r0, I, .L12
	.align 3
/* pass 1 drain: fold the last preloaded four elements into the maxima */
.L13:
	FABS t1, a1
	FABS t2, a2
	FABS t3, a3
	FABS t4, a4
	CMPLT $fcc0, s1, t1
	CMPLT $fcc1, s2, t2
	CMPLT $fcc2, s3, t3
	CMPLT $fcc3, s4, t4
	CMOVT s1, s1, t1, $fcc0
	CMOVT s2, s2, t2, $fcc1
	CMOVT s3, s3, t3, $fcc2
	CMOVT s4, s4, t4, $fcc3
	FABS t1, a5
	FABS t2, a6
	FABS t3, a7
	FABS t4, a8
	CMPLT $fcc0, s1, t1
	CMPLT $fcc1, s2, t2
	CMPLT $fcc2, s3, t3
	CMPLT $fcc3, s4, t4
	CMOVT s1, s1, t1, $fcc0
	CMOVT s2, s2, t2, $fcc1
	CMOVT s3, s3, t3, $fcc2
	CMOVT s4, s4, t4, $fcc3
	.align 3
/* pass 1 remainder: N % 4 elements */
.L15:
	andi I, N, 3
	bge $r0, I, .L100
	.align 3
.L16:
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	addi.d I, I, -1
	FABS t1, a1
	FABS t2, a2
	CMPLT $fcc0, s1, t1
	CMPLT $fcc1, s2, t2
	CMOVT s1, s1, t1, $fcc0
	CMOVT s2, s2, t2, $fcc1
	add.d X, X, INCX
	blt $r0, I, .L16
	.align 3
/* reduce the four partial maxima into s1, build ALPHA = 1.0 / max */
.L100:
	CMPLT $fcc0, s1, s2
	CMPLT $fcc1, s3, s4
	CMOVT s1, s1, s2, $fcc0
	CMOVT s3, s3, s4, $fcc1
	CMPLT $fcc0, s1, s3
	CMOVT s1, s1, s3, $fcc0
/* 0x3f800000 is single-precision 1.0; widened to double by fcvt.d.s */
	lu12i.w TEMP, 0x3f800
	movgr2fr.d a1, $r0
	movgr2fr.w ALPHA, TEMP
	CMPEQ $fcc0, s1, a1
	fcvt.d.s ALPHA, ALPHA
/* max == 0 means the whole vector is zero: return 0 */
	bcnez $fcc0, .L999
	fdiv.d ALPHA, ALPHA, s1
	MOV max, s1
/* pass 2: rescan from XX, accumulating sum((ALPHA*x)^2) in s1..s4 */
	MOV s1, a1
	MOV s2, a1
	MOV s3, a1
	MOV s4, a1
	srai.d I, N, 2
	bge $r0, I, .L105
	LD a1, XX, 0 * SIZE
	LD a2, XX, 1 * SIZE
	add.d XX, XX, INCX
	LD a3, XX, 0 * SIZE
	LD a4, XX, 1 * SIZE
	add.d XX, XX, INCX
	LD a5, XX, 0 * SIZE
	LD a6, XX, 1 * SIZE
	add.d XX, XX, INCX
	LD a7, XX, 0 * SIZE
	LD a8, XX, 1 * SIZE
	addi.d I, I, -1
	add.d XX, XX, INCX
	bge $r0, I, .L104
	.align 3
/* pass 2 main loop: scaled squares accumulated while preloading */
.L103:
	MUL t1, ALPHA, a1
	LD a1, XX, 0 * SIZE
	MUL t2, ALPHA, a2
	addi.d I, I, -1
	MUL t3, ALPHA, a3
	LD a2, XX, 1 * SIZE
	MUL t4, ALPHA, a4
	add.d XX, XX, INCX
	MADD s1, t1, t1, s1
	LD a3, XX, 0 * SIZE
	MADD s2, t2, t2, s2
	NOP
	MADD s3, t3, t3, s3
	LD a4, XX, 1 * SIZE
	MADD s4, t4, t4, s4
	add.d XX, XX, INCX
	MUL t1, ALPHA, a5
	LD a5, XX, 0 * SIZE
	MUL t2, ALPHA, a6
	NOP
	MUL t3, ALPHA, a7
	LD a6, XX, 1 * SIZE
	MUL t4, ALPHA, a8
	add.d XX, XX, INCX
	MADD s1, t1, t1, s1
	LD a7, XX, 0 * SIZE
	MADD s2, t2, t2, s2
	LD a8, XX, 1 * SIZE
	MADD s3, t3, t3, s3
	add.d XX, XX, INCX
	MADD s4, t4, t4, s4
	blt $r0, I, .L103
	.align 3
/* pass 2 drain */
.L104:
	MUL t1, ALPHA, a1
	MUL t2, ALPHA, a2
	MUL t3, ALPHA, a3
	MUL t4, ALPHA, a4
	MADD s1, t1, t1, s1
	MADD s2, t2, t2, s2
	MADD s3, t3, t3, s3
	MADD s4, t4, t4, s4
	MUL t1, ALPHA, a5
	MUL t2, ALPHA, a6
	MUL t3, ALPHA, a7
	MUL t4, ALPHA, a8
	MADD s1, t1, t1, s1
	MADD s2, t2, t2, s2
	MADD s3, t3, t3, s3
	MADD s4, t4, t4, s4
	.align 3
/* pass 2 remainder: N % 4 elements */
.L105:
	andi I, N, 3
	bge $r0, I, .L998
	.align 3
.L106:
	LD a1, XX, 0 * SIZE
	LD a2, XX, 1 * SIZE
	addi.d I, I, -1
	MUL t1, ALPHA, a1
	MUL t2, ALPHA, a2
	MADD s1, t1, t1, s1
	add.d XX, XX, INCX
	MADD s2, t2, t2, s2
	blt $r0, I, .L106
	.align 3
/* result = max * sqrt(sum of scaled squares) */
.L998:
	ADD s1, s1, s2
	ADD s3, s3, s4
	ADD s1, s1, s3
	fsqrt.d s1, s1
/* NOTE(review): $r4 is set from I here, apparently a leftover of MIPS
   delay-slot scheduling; the integer return value looks unused - confirm */
	move $r4, $r17
	MUL $f0, max, s1
	jirl $r0, $r1, 0x0
	.align 3
/* early-out path: returns s1 ($f22), which is 0.0 whenever reached */
.L999:
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	EPILOGUE

356
kernel/loongarch64/zscal.S Normal file
View File

@ -0,0 +1,356 @@
/***************************************************************************
Copyright (c) 2021, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define ASSEMBLER
#include "common.h"
/*
 * zscal kernel for LoongArch64: x := alpha * x for a complex vector.
 *
 * Four specialized paths:
 *   .L12/.L16 - alpha == 0, unit stride: plain zero fill (x is not read);
 *   .L22/.L26 - alpha == 0, general stride: strided zero fill;
 *   .L50      - alpha != 0, unit stride: pipelined complex scaling;
 *   .L60      - alpha != 0, general stride: scaling with separate
 *               read (X) and write (XX) pointers, since stores lag loads.
 * Each complex product is (ar*xr - ai*xi, ar*xi + ai*xr) built from
 * MUL/NMSUB/MADD pairs.
 */
#define N $r4
#define X $r7
#define INCX $r8
#define I $r17
#define TEMP $r18
#define XX $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define a1 $f22
#define a2 $f8
#define a3 $f23
#define a4 $f9
#define a5 $f10
#define a6 $f11
#define a7 $f12
#define a8 $f13
#define t1 $f14
#define t2 $f15
#define t3 $f16
#define t4 $f17
	PROLOGUE
/* TEMP = byte size of one complex element, used to detect unit stride */
	li TEMP, 2 * SIZE
	MTC a1, $r0
	slli.d INCX, INCX, ZBASE_SHIFT
	bge $r0, N, .L999
/* dispatch: go to .L50 if either component of alpha is nonzero
   (bceqz branches when the compare-equal flag is clear) */
	CMPEQ $fcc0, ALPHA_R, a1
	CMPEQ $fcc1, ALPHA_I, a1
	bceqz $fcc0, .L50
	bceqz $fcc1, .L50
	srai.d I, N, 2
	bne INCX, TEMP, .L20
	bge $r0, I, .L15
	.align 3
/* alpha == 0, unit stride: store zeros, 4 complex elements per pass */
.L12:
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	ST a1, X, 2 * SIZE
	ST a1, X, 3 * SIZE
	ST a1, X, 4 * SIZE
	ST a1, X, 5 * SIZE
	ST a1, X, 6 * SIZE
	ST a1, X, 7 * SIZE
/* NOTE(review): addi.w here vs addi.d elsewhere for the same counter -
   harmless while I fits in 32 bits, but inconsistent */
	addi.w I, I, -1
	addi.d X, X, 8 * SIZE
	blt $r0, I, .L12
	.align 3
.L15:
	andi I, N, 3
	bge $r0, I, .L999
	.align 3
.L16:
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	addi.d I, I, -1
	addi.d X, X, 2 * SIZE
	blt $r0, I, .L16
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	.align 3
/* alpha == 0, general stride: store zeros element by element */
.L20:
	srai.d I, N, 2
	bge $r0, I, .L25
	.align 3
.L22:
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	add.d X, X, INCX
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	add.d X, X, INCX
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	add.d X, X, INCX
	ST a1, X, 0 * SIZE
	ST a1, X, 1 * SIZE
	addi.d I, I, -1
	add.d X, X, INCX
	blt $r0, I, .L22
	.align 3
.L25:
	andi I, N, 3
	bge $r0, I, .L999
	.align 3
.L26:
	ST a1, X, 0 * SIZE
	addi.d I, I, -1
	ST a1, X, 1 * SIZE
	add.d X, X, INCX
	blt $r0, I, .L26
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	.align 3
/* alpha != 0, unit stride: software-pipelined scale, 4 complex/pass */
.L50:
	srai.d I, N, 2
	bne INCX, TEMP, .L60
	addi.d I, I, -1
	blt I, $r0, .L55
/* pipeline prologue: load 4 elements, start the first two products */
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	LD a3, X, 2 * SIZE
	LD a4, X, 3 * SIZE
	LD a5, X, 4 * SIZE
	LD a6, X, 5 * SIZE
	MUL t1, ALPHA_R, a1
	LD a7, X, 6 * SIZE
	MUL t2, ALPHA_I, a1
	LD a8, X, 7 * SIZE
	MUL t3, ALPHA_R, a3
	MUL t4, ALPHA_I, a3
	bge $r0, I, .L53
	.align 3
/* steady state: finish products for the current 4 elements, store them,
   and load/start products for the next 4 */
.L52:
	NMSUB t1, a2, ALPHA_I, t1
	LD a1, X, 8 * SIZE
	MADD t2, a2, ALPHA_R, t2
	LD a2, X, 9 * SIZE
	NMSUB t3, a4, ALPHA_I, t3
	LD a3, X, 10 * SIZE
	MADD t4, a4, ALPHA_R, t4
	LD a4, X, 11 * SIZE
	ST t1, X, 0 * SIZE
	MUL t1, ALPHA_R, a5
	ST t2, X, 1 * SIZE
	MUL t2, ALPHA_I, a5
	ST t3, X, 2 * SIZE
	MUL t3, ALPHA_R, a7
	ST t4, X, 3 * SIZE
	MUL t4, ALPHA_I, a7
	NMSUB t1, a6, ALPHA_I, t1
	LD a5, X, 12 * SIZE
	MADD t2, a6, ALPHA_R, t2
	LD a6, X, 13 * SIZE
	NMSUB t3, a8, ALPHA_I, t3
	LD a7, X, 14 * SIZE
	MADD t4, a8, ALPHA_R, t4
	LD a8, X, 15 * SIZE
	ST t1, X, 4 * SIZE
	MUL t1, ALPHA_R, a1
	ST t2, X, 5 * SIZE
	MUL t2, ALPHA_I, a1
	ST t3, X, 6 * SIZE
	MUL t3, ALPHA_R, a3
	ST t4, X, 7 * SIZE
	MUL t4, ALPHA_I, a3
	addi.d I, I, -1
	addi.d X, X, 8 * SIZE
	blt $r0, I, .L52
	.align 3
/* pipeline drain: complete and store the last preloaded 4 elements */
.L53:
	NMSUB t1, a2, ALPHA_I, t1
	MADD t2, a2, ALPHA_R, t2
	NMSUB t3, a4, ALPHA_I, t3
	MADD t4, a4, ALPHA_R, t4
	ST t1, X, 0 * SIZE
	MUL t1, ALPHA_R, a5
	ST t2, X, 1 * SIZE
	MUL t2, ALPHA_I, a5
	ST t3, X, 2 * SIZE
	MUL t3, ALPHA_R, a7
	ST t4, X, 3 * SIZE
	MUL t4, ALPHA_I, a7
	NMSUB t1, a6, ALPHA_I, t1
	MADD t2, a6, ALPHA_R, t2
	NMSUB t3, a8, ALPHA_I, t3
	MADD t4, a8, ALPHA_R, t4
	ST t1, X, 4 * SIZE
	ST t2, X, 5 * SIZE
	ST t3, X, 6 * SIZE
	ST t4, X, 7 * SIZE
	addi.d X, X, 8 * SIZE
	.align 3
/* unit-stride remainder: N % 4 elements, scalar complex multiply */
.L55:
	andi I, N, 3
	bge $r0, I, .L999
	.align 3
.L56:
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	MUL t1, ALPHA_R, a1
	MUL t2, ALPHA_I, a1
	NMSUB t1, a2, ALPHA_I, t1
	MADD t2, a2, ALPHA_R, t2
	addi.d X, X, 2 * SIZE
	addi.d I, I, -1
	ST t1, X, -2 * SIZE
	ST t2, X, -1 * SIZE
	blt $r0, I, .L56
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	.align 3
/* alpha != 0, general stride: X reads ahead, XX trails for the stores */
.L60:
	srai.d I, N, 2
	move XX, X
	addi.d I, I, -1
	blt I, $r0, .L65
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	add.d X, X, INCX
	LD a3, X, 0 * SIZE
	LD a4, X, 1 * SIZE
	add.d X, X, INCX
	LD a5, X, 0 * SIZE
	LD a6, X, 1 * SIZE
	add.d X, X, INCX
	MUL t1, ALPHA_R, a1
	LD a7, X, 0 * SIZE
	MUL t2, ALPHA_I, a1
	LD a8, X, 1 * SIZE
	MUL t3, ALPHA_R, a3
	add.d X, X, INCX
	MUL t4, ALPHA_I, a3
	bge $r0, I, .L63
	.align 3
/* strided steady state, same structure as .L52 */
.L62:
	NMSUB t1, a2, ALPHA_I, t1
	LD a1, X, 0 * SIZE
	MADD t2, a2, ALPHA_R, t2
	LD a2, X, 1 * SIZE
	add.d X, X, INCX
	NMSUB t3, a4, ALPHA_I, t3
	LD a3, X, 0 * SIZE
	MADD t4, a4, ALPHA_R, t4
	LD a4, X, 1 * SIZE
	add.d X, X, INCX
	ST t1, XX, 0 * SIZE
	MUL t1, ALPHA_R, a5
	ST t2, XX, 1 * SIZE
	MUL t2, ALPHA_I, a5
	add.d XX, XX, INCX
	ST t3, XX, 0 * SIZE
	MUL t3, ALPHA_R, a7
	ST t4, XX, 1 * SIZE
	MUL t4, ALPHA_I, a7
	add.d XX, XX, INCX
	NMSUB t1, a6, ALPHA_I, t1
	LD a5, X, 0 * SIZE
	MADD t2, a6, ALPHA_R, t2
	LD a6, X, 1 * SIZE
	add.d X, X, INCX
	NMSUB t3, a8, ALPHA_I, t3
	LD a7, X, 0 * SIZE
	MADD t4, a8, ALPHA_R, t4
	LD a8, X, 1 * SIZE
	add.d X, X, INCX
	ST t1, XX, 0 * SIZE
	MUL t1, ALPHA_R, a1
	ST t2, XX, 1 * SIZE
	MUL t2, ALPHA_I, a1
	add.d XX, XX, INCX
	ST t3, XX, 0 * SIZE
	MUL t3, ALPHA_R, a3
	ST t4, XX, 1 * SIZE
	MUL t4, ALPHA_I, a3
	addi.d I, I, -1
	add.d XX, XX, INCX
	blt $r0, I, .L62
	.align 3
/* strided pipeline drain */
.L63:
	NMSUB t1, a2, ALPHA_I, t1
	MADD t2, a2, ALPHA_R, t2
	NMSUB t3, a4, ALPHA_I, t3
	MADD t4, a4, ALPHA_R, t4
	ST t1, XX, 0 * SIZE
	MUL t1, ALPHA_R, a5
	ST t2, XX, 1 * SIZE
	MUL t2, ALPHA_I, a5
	add.d XX, XX, INCX
	ST t3, XX, 0 * SIZE
	MUL t3, ALPHA_R, a7
	ST t4, XX, 1 * SIZE
	MUL t4, ALPHA_I, a7
	add.d XX, XX, INCX
	NMSUB t1, a6, ALPHA_I, t1
	MADD t2, a6, ALPHA_R, t2
	NMSUB t3, a8, ALPHA_I, t3
	MADD t4, a8, ALPHA_R, t4
	ST t1, XX, 0 * SIZE
	ST t2, XX, 1 * SIZE
	add.d XX, XX, INCX
	ST t3, XX, 0 * SIZE
	ST t4, XX, 1 * SIZE
	add.d XX, XX, INCX
	.align 3
/* strided remainder: N % 4 elements */
.L65:
	andi I, N, 3
	bge $r0, I, .L999
	.align 3
.L66:
	LD a1, X, 0 * SIZE
	LD a2, X, 1 * SIZE
	MUL t1, ALPHA_R, a1
	MUL t2, ALPHA_I, a1
	NMSUB t1, a2, ALPHA_I, t1
	MADD t2, a2, ALPHA_R, t2
	addi.d I, I, -1
	ST t1, X, 0 * SIZE
	ST t2, X, 1 * SIZE
	add.d X, X, INCX
	blt $r0, I, .L66
	.align 3
/* common return; NOTE(review): $r4/$f0 values look like leftovers of the
   MIPS kernels' delay-slot scheduling and appear unused by callers */
.L999:
	move $r4, $r17
	fmov.d $f0, $f22
	jirl $r0, $r1, 0x0
	EPILOGUE

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,12 @@
# Build glue for the LoongArch64 LAPACK kernel directory.
# Pulls in the global OpenBLAS build settings, then falls back to the
# generic C row-swap (laswp) implementations unless an arch-specific
# kernel has been selected elsewhere, and finally reuses the generic
# per-directory build rules.
TOPDIR = ../../..
include ../../../Makefile.system
# real and complex laswp default to the portable C kernels
ifndef LASWP
LASWP = ../generic/laswp_k.c
endif
ifndef ZLASWP
ZLASWP = ../generic/zlaswp_k.c
endif
include ../generic/Makefile

46
param.h
View File

@ -2691,6 +2691,52 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SYMV_P 16
#endif
#if defined (LOONGSON3R5)
/*
 * GEMM blocking / unrolling parameters for the Loongson 3R5
 * (LoongArch64) target.  Prefix key: S/D/Q = single/double/extended
 * real, C/Z/X = single/double/extended complex.
 */
#define SNUMOPT 2
#define DNUMOPT 2

#define GEMM_DEFAULT_OFFSET_A 0
#define GEMM_DEFAULT_OFFSET_B 0
/* buffer alignment mask for GEMM work areas (64 KB - 1) */
#define GEMM_DEFAULT_ALIGN 0x0ffffUL

/* register-blocking (unroll) factors of the GEMM microkernels */
#define SGEMM_DEFAULT_UNROLL_N 8
#define DGEMM_DEFAULT_UNROLL_N 8
#define QGEMM_DEFAULT_UNROLL_N 2
#define CGEMM_DEFAULT_UNROLL_N 4
#define ZGEMM_DEFAULT_UNROLL_N 4
#define XGEMM_DEFAULT_UNROLL_N 1

#define SGEMM_DEFAULT_UNROLL_M 2
#define DGEMM_DEFAULT_UNROLL_M 2
#define QGEMM_DEFAULT_UNROLL_M 2
#define CGEMM_DEFAULT_UNROLL_M 1
#define ZGEMM_DEFAULT_UNROLL_M 1
#define XGEMM_DEFAULT_UNROLL_M 1

/* P/R blocking sizes resolved at runtime (global variables) */
#define SGEMM_DEFAULT_P sgemm_p
#define DGEMM_DEFAULT_P dgemm_p
#define QGEMM_DEFAULT_P qgemm_p
#define CGEMM_DEFAULT_P cgemm_p
#define ZGEMM_DEFAULT_P zgemm_p
#define XGEMM_DEFAULT_P xgemm_p

#define SGEMM_DEFAULT_R sgemm_r
#define DGEMM_DEFAULT_R dgemm_r
#define QGEMM_DEFAULT_R qgemm_r
#define CGEMM_DEFAULT_R cgemm_r
#define ZGEMM_DEFAULT_R zgemm_r
#define XGEMM_DEFAULT_R xgemm_r

/* fixed K-dimension blocking */
#define SGEMM_DEFAULT_Q 128
#define DGEMM_DEFAULT_Q 128
#define QGEMM_DEFAULT_Q 128
#define CGEMM_DEFAULT_Q 128
#define ZGEMM_DEFAULT_Q 128
#define XGEMM_DEFAULT_Q 128

#define SYMV_P 16
#endif
#if defined(P5600) || defined(MIPS1004K) || defined(MIPS24K) || defined(I6400) || defined(P6600) || defined(I6500)
#define SNUMOPT 2
#define DNUMOPT 2