diff --git a/Makefile.install b/Makefile.install index 7c1a3ca43..e8b64465f 100644 --- a/Makefile.install +++ b/Makefile.install @@ -9,7 +9,7 @@ OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin OPENBLAS_BUILD_DIR := $(CURDIR) -OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/openblas +OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake OPENBLAS_CMAKE_CONFIG_VERSION := OpenBLASConfigVersion.cmake OPENBLAS_PKGCONFIG_DIR := $(OPENBLAS_LIBRARY_DIR)/pkgconfig @@ -150,13 +150,13 @@ endif endif #Generating openblas.pc - @echo Generating openblas.pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" - @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" - @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/openblas.pc" + @echo Generating $(LIBSONAMEBASE).pc in "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" + @echo 'libdir='$(OPENBLAS_LIBRARY_DIR) > "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'openblas_config= USE_64BITINT='$(USE_64BITINT) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'version='$(VERSION) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @echo 'extralib='$(PKG_EXTRALIB) >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" + @cat openblas.pc.in >> "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)/$(LIBSONAMEBASE).pc" #Generating OpenBLASConfig.cmake diff --git a/Makefile.prebuild b/Makefile.prebuild index 48fb5e991..d6395da7b 100644 --- a/Makefile.prebuild +++ b/Makefile.prebuild @@ -41,6 +41,10 @@ ifeq ($(TARGET), I6500) TARGET_FLAGS = -mips64r6 endif +ifeq ($(TARGET), C910V) +TARGET_FLAGS = -march=rv64gcvxthead -mabi=lp64v +endif + all: getarch_2nd ./getarch_2nd 0 >> $(TARGET_MAKE) ./getarch_2nd 1 >> $(TARGET_CONF) diff --git a/Makefile.riscv64 b/Makefile.riscv64 new file mode 100644 index 000000000..15d7b059c --- /dev/null +++ b/Makefile.riscv64 @@ -0,0 +1,4 @@ +ifeq ($(CORE), C910V) +CCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v +FCOMMON_OPT += -march=rv64gcvxthead -mabi=lp64v -static +endif diff --git a/Makefile.system b/Makefile.system index aae7ba503..afc8ee207 100644 --- a/Makefile.system +++ b/Makefile.system @@ -751,7 +751,10 @@ endif endif endif - +ifeq ($(ARCH), riscv64) +NO_BINARY_MODE = 1 +BINARY_DEFINED = 1 +endif # @@ -833,7 +836,9 @@ endif ifndef BINARY_DEFINED ifneq ($(OSNAME), AIX) ifdef BINARY64 +ifneq ($(ARCH), riscv64) CCOMMON_OPT += -m64 +endif else CCOMMON_OPT += -m32 endif @@ -954,8 +959,10 @@ endif else ifdef BINARY64 ifneq ($(OSNAME), AIX) +ifneq ($(ARCH), riscv64) FCOMMON_OPT += -m64 endif +endif ifdef INTERFACE64 ifneq ($(INTERFACE64), 0) FCOMMON_OPT += -fdefault-integer-8 @@ -1285,10 +1292,14 @@ ifndef SYMBOLSUFFIX SYMBOLSUFFIX = endif +ifndef LIBSONAMEBASE +LIBSONAMEBASE = openblas +endif + ifndef LIBNAMESUFFIX -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX) else -LIBNAMEBASE = $(SYMBOLPREFIX)openblas$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) +LIBNAMEBASE = $(SYMBOLPREFIX)$(LIBSONAMEBASE)$(SYMBOLSUFFIX)_$(LIBNAMESUFFIX) endif ifeq ($(OSNAME), CYGWIN_NT) diff --git a/README.md b/README.md index ca034e747..267df5358 100644 --- a/README.md +++ b/README.md @@ -172,6 +172,13 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th - **Z13**: Optimized Level-3 BLAS and Level-1,2 - **Z14**: Optimized Level-3 BLAS and (single precision) Level-1,2 +#### RISC-V + +- **C910V**: Optimized Leve-3 BLAS (real) and Level-1,2 by RISC-V Vector extension 0.7.1. + ```sh + make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran + ``` + ### Support for multiple targets in a single library OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake. diff --git a/TargetList.txt b/TargetList.txt index 66eca4506..d19964916 100644 --- a/TargetList.txt +++ b/TargetList.txt @@ -104,3 +104,8 @@ VORTEX ZARCH_GENERIC Z13 Z14 + +10.RISC-V 64: +RISCV64_GENERIC +C910V + diff --git a/c_check b/c_check index 5ea93b75c..405963ae6 100644 --- a/c_check +++ b/c_check @@ -92,6 +92,7 @@ $architecture = ia64 if ($data =~ /ARCH_IA64/); $architecture = arm if ($data =~ /ARCH_ARM/); $architecture = arm64 if ($data =~ /ARCH_ARM64/); $architecture = zarch if ($data =~ /ARCH_ZARCH/); +$architecture = riscv64 if ($data =~ /ARCH_RISCV64/); $defined = 0; @@ -136,6 +137,11 @@ if (($architecture eq "x86") && ($os ne Darwin) && ($os ne SunOS)) { $binary =32; } +if ($architecture eq "riscv64") { + $defined = 1; + $binary = 64; +} + if ($compiler eq "PGI") { $compiler_name .= " -tp p7" if ($binary eq "32"); $compiler_name .= " -tp p7-64" if ($binary eq "64"); diff --git a/cmake/cc.cmake b/cmake/cc.cmake index b963940d6..76952152b 100644 --- a/cmake/cc.cmake +++ b/cmake/cc.cmake @@ -124,6 +124,9 @@ if (NOT DYNAMIC_ARCH) if (HAVE_AVX) set (CCOMMON_OPT "${CCOMMON_OPT} -mavx") endif () + if (HAVE_FMA3) + set (CCOMMON_OPT "${CCOMMON_OPT} -mfma") + endif () if (HAVE_SSE) set (CCOMMON_OPT "${CCOMMON_OPT} -msse") endif () diff --git a/common.h b/common.h index a3ef99b59..2825407cb 100644 --- a/common.h +++ b/common.h @@ -437,6 +437,11 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_mips.h" #endif + +#ifdef ARCH_RISCV64 +#include "common_riscv64.h" +#endif + #ifdef ARCH_MIPS64 #include "common_mips64.h" #endif diff --git a/common_riscv64.h b/common_riscv64.h new file mode 100644 index 000000000..27f385dfd --- /dev/null +++ b/common_riscv64.h @@ -0,0 +1,98 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +**********************************************************************************/ + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#ifndef COMMON_RISCV64 +#define COMMON_RISCV64 + +#define MB __sync_synchronize() +#define WMB __sync_synchronize() +#define RMB __sync_synchronize() + +#define INLINE inline + +#ifndef ASSEMBLER + + +static inline int blas_quickdivide(blasint x, blasint y){ + return x / y; +} + +#endif + + + +#define BUFFER_SIZE ( 32 << 20) +#define SEEK_ADDRESS + +#if defined(C910V) +#include +#endif + +#endif diff --git a/cpuid_riscv64.c b/cpuid_riscv64.c new file mode 100644 index 000000000..0eb50e001 --- /dev/null +++ b/cpuid_riscv64.c @@ -0,0 +1,113 @@ +/***************************************************************************** +Copyright (c) 2011-2014, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + + +/*********************************************************************/ +/* Copyright 2009, 2010 The University of Texas at Austin. */ +/* All rights reserved. */ +/* */ +/* Redistribution and use in source and binary forms, with or */ +/* without modification, are permitted provided that the following */ +/* conditions are met: */ +/* */ +/* 1. Redistributions of source code must retain the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer. */ +/* */ +/* 2. Redistributions in binary form must reproduce the above */ +/* copyright notice, this list of conditions and the following */ +/* disclaimer in the documentation and/or other materials */ +/* provided with the distribution. */ +/* */ +/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */ +/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */ +/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */ +/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */ +/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */ +/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */ +/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */ +/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */ +/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */ +/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */ +/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */ +/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */ +/* POSSIBILITY OF SUCH DAMAGE. */ +/* */ +/* The views and conclusions contained in the software and */ +/* documentation are those of the authors and should not be */ +/* interpreted as representing official policies, either expressed */ +/* or implied, of The University of Texas at Austin. */ +/*********************************************************************/ + +#define CPU_UNKNOWN 0 +#define CPU_C910V 1 + +static char *cpuname[] = { + "UNKOWN", + "C910V" +}; + +int detect(void){ + return CPU_UNKNOWN; +} + +char *get_corename(void){ + return cpuname[detect()]; +} + +void get_architecture(void){ + printf("RISCV64"); +} + +void get_subarchitecture(void){ +} + +void get_subdirname(void){ + printf("riscv64"); +} + +void get_cpuconfig(void){ + printf("#define UNKNOWN\n"); + printf("#define L1_DATA_SIZE 65536\n"); + printf("#define L1_DATA_LINESIZE 32\n"); + printf("#define L2_SIZE 512488\n"); + printf("#define L2_LINESIZE 32\n"); + printf("#define DTB_DEFAULT_ENTRIES 64\n"); + printf("#define DTB_SIZE 4096\n"); + printf("#define L2_ASSOCIATIVE 4\n"); +} + +void get_libname(void){ + printf("riscv64\n"); +} diff --git a/ctest.c b/ctest.c index cd84ab1bb..d674a8cbd 100644 --- a/ctest.c +++ b/ctest.c @@ -153,6 +153,11 @@ ARCH_ARM ARCH_ARM64 #endif +#if defined(__riscv) +ARCH_RISCV64 +#endif + #if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) HAVE_C11 #endif + diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c index a8b3e9a4b..a576127aa 100644 --- a/driver/others/blas_server_omp.c +++ b/driver/others/blas_server_omp.c @@ -76,10 +76,28 @@ static atomic_bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; static _Bool blas_buffer_inuse[MAX_PARALLEL_NUMBER]; #endif -void goto_set_num_threads(int num_threads) { +static void adjust_thread_buffers() { int i=0, j=0; + //adjust buffer for each thread + for(i=0; i < MAX_PARALLEL_NUMBER; i++) { + for(j=0; j < blas_cpu_number; j++){ + if(blas_thread_buffer[i][j] == NULL){ + blas_thread_buffer[i][j] = blas_memory_alloc(2); + } + } + for(; j < MAX_CPU_NUMBER; j++){ + if(blas_thread_buffer[i][j] != NULL){ + blas_memory_free(blas_thread_buffer[i][j]); + blas_thread_buffer[i][j] = NULL; + } + } + } +} + +void goto_set_num_threads(int num_threads) { + if (num_threads < 1) num_threads = blas_num_threads; if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER; @@ -92,20 +110,7 @@ void goto_set_num_threads(int num_threads) { omp_set_num_threads(blas_cpu_number); - //adjust buffer for each thread - for(i=0; i 128)) #ifdef DOUBLE const int vstep = v_nlanes_f64; - const int unrollx2 = n & (-vstep * 2); + const int unrollx4 = n & (-vstep * 4); const int unrollx = n & -vstep; v_f64 vsum0 = v_zero_f64(); v_f64 vsum1 = v_zero_f64(); - while (i < unrollx2) - { - vsum0 = v_add_f64(vsum0, v_loadu_f64(x)); - vsum1 = v_add_f64(vsum1, v_loadu_f64(x + vstep)); - i += vstep * 2; - } - vsum0 = v_add_f64(vsum0, vsum1); - while (i < unrollx) + v_f64 vsum2 = v_zero_f64(); + v_f64 vsum3 = v_zero_f64(); + for (; i < unrollx4; i += vstep * 4) + { + vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); + vsum1 = v_add_f64(vsum1, v_loadu_f64(x + i + vstep)); + vsum2 = v_add_f64(vsum2, v_loadu_f64(x + i + vstep * 2)); + vsum3 = v_add_f64(vsum3, v_loadu_f64(x + i + vstep * 3)); + } + vsum0 = v_add_f64( + v_add_f64(vsum0, vsum1), v_add_f64(vsum2, vsum3)); + for (; i < unrollx; i += vstep) { vsum0 = v_add_f64(vsum0, v_loadu_f64(x + i)); - i += vstep; } sumf = v_sum_f64(vsum0); #else @@ -70,20 +73,18 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) v_f32 vsum1 = v_zero_f32(); v_f32 vsum2 = v_zero_f32(); v_f32 vsum3 = v_zero_f32(); - while (i < unrollx4) + for (; i < unrollx4; i += vstep * 4) { - vsum0 = v_add_f32(vsum0, v_loadu_f32(x)); - vsum1 = v_add_f32(vsum1, v_loadu_f32(x + vstep)); - vsum2 = v_add_f32(vsum2, v_loadu_f32(x + vstep * 2)); - vsum3 = v_add_f32(vsum3, v_loadu_f32(x + vstep * 3)); - i += vstep * 4; + vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); + vsum1 = v_add_f32(vsum1, v_loadu_f32(x + i + vstep)); + vsum2 = v_add_f32(vsum2, v_loadu_f32(x + i + vstep * 2)); + vsum3 = v_add_f32(vsum3, v_loadu_f32(x + i + vstep * 3)); } vsum0 = v_add_f32( v_add_f32(vsum0, vsum1), v_add_f32(vsum2, vsum3)); - while (i < unrollx) + for (; i < unrollx; i += vstep) { vsum0 = v_add_f32(vsum0, v_loadu_f32(x + i)); - i += vstep; } sumf = v_sum_f32(vsum0); #endif diff --git a/kernel/generic/trmmkernel_16x4.c b/kernel/generic/trmmkernel_16x4.c new file mode 100644 index 000000000..7ea4e108c --- /dev/null +++ b/kernel/generic/trmmkernel_16x4.c @@ -0,0 +1,2092 @@ +#include "common.h" + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) +{ + + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; + + FLOAT res0_0; + FLOAT res0_1; + FLOAT res0_2; + FLOAT res0_3; + FLOAT res0_4; + FLOAT res0_5; + FLOAT res0_6; + FLOAT res0_7; + + FLOAT res0_8; + FLOAT res0_9; + FLOAT res0_10; + FLOAT res0_11; + FLOAT res0_12; + FLOAT res0_13; + FLOAT res0_14; + FLOAT res0_15; + + FLOAT res1_0; + FLOAT res1_1; + FLOAT res1_2; + FLOAT res1_3; + FLOAT res1_4; + FLOAT res1_5; + FLOAT res1_6; + FLOAT res1_7; + + FLOAT res1_8; + FLOAT res1_9; + FLOAT res1_10; + FLOAT res1_11; + FLOAT res1_12; + FLOAT res1_13; + FLOAT res1_14; + FLOAT res1_15; + + FLOAT res2_0; + FLOAT res2_1; + FLOAT res2_2; + FLOAT res2_3; + FLOAT res2_4; + FLOAT res2_5; + FLOAT res2_6; + FLOAT res2_7; + + FLOAT res2_8; + FLOAT res2_9; + FLOAT res2_10; + FLOAT res2_11; + FLOAT res2_12; + FLOAT res2_13; + FLOAT res2_14; + FLOAT res2_15; + + FLOAT res3_0; + FLOAT res3_1; + FLOAT res3_2; + FLOAT res3_3; + FLOAT res3_4; + FLOAT res3_5; + FLOAT res3_6; + FLOAT res3_7; + + FLOAT res3_8; + FLOAT res3_9; + FLOAT res3_10; + FLOAT res3_11; + FLOAT res3_12; + FLOAT res3_13; + FLOAT res3_14; + FLOAT res3_15; + + FLOAT a0; + FLOAT a1; + + FLOAT b0; + FLOAT b1; + FLOAT b2; + FLOAT b3; + + BLASLONG off, temp; + +#if !defined(LEFT) + off = -offset; +#else + off = 0; +#endif + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/amax_vector.c b/kernel/riscv64/amax_vector.c new file mode 100644 index 000000000..b6aec131e --- /dev/null +++ b/kernel/riscv64/amax_vector.c @@ -0,0 +1,245 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + BLASLONG inc_xv = inc_x * gvl; + v_max = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/amin.c b/kernel/riscv64/amin.c new file mode 100644 index 000000000..78495a8e3 --- /dev/null +++ b/kernel/riscv64/amin.c @@ -0,0 +1,75 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < minf ) + { + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/amin_vector.c b/kernel/riscv64/amin_vector.c new file mode 100644 index 000000000..53243ad56 --- /dev/null +++ b/kernel/riscv64/amin_vector.c @@ -0,0 +1,241 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + MASK_T mask0, mask1; + FLOAT zero = 0.0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + if (n <= 0 || inc_x <= 0) return(sumf); + + n *= inc_x; + while(i < n) + { + sumf += ABS(x[i]); + i += inc_x; + } + return(sumf); +} + + diff --git a/kernel/riscv64/asum_vector.c b/kernel/riscv64/asum_vector.c new file mode 100644 index 000000000..7ab7484e8 --- /dev/null +++ b/kernel/riscv64/asum_vector.c @@ -0,0 +1,131 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*8 \n\t"\ + "addi t2, %[PB], 2*8 \n\t"\ + "addi t3, %[PB], 3*8 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "fld ft1, (t1) \n\t"\ + "fld ft2, (t2) \n\t"\ + "fld ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 2*8 \n\t"\ + "addi t5, %[PA], 4*8 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 6*8 \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "fld ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "fld ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "fld ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "fld ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*8 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*8 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 8*8 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 8*8 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "fld ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "fld ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "fld ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "fld ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*8 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*8 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*8 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*8 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLEV_FLOAT(&y[j], gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int stride_y = inc_y * sizeof(FLOAT); + for(i=0,j=0; i 0){ + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDSUM_FLOAT(vr, vx, gvl); + dot += vx[0]; + } + //tail + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[j*inc_x], stride_x, gvl); + vy = VLSEV_FLOAT(&y[j*inc_y], stride_y, gvl); + FLOAT_V_T vz = VFMVVF_FLOAT(0, gvl); + //vr = VFDOTVV_FLOAT(vx, vy, gvl); + vr = VFMACCVV_FLOAT(vz, vx, vy, gvl); + vx = VFREDSUM_FLOAT(vr, vz, gvl); + dot += vx[0]; + } + } + return(dot); +} + + diff --git a/kernel/riscv64/gemv_n.c b/kernel/riscv64/gemv_n.c new file mode 100644 index 000000000..ef61b245b --- /dev/null +++ b/kernel/riscv64/gemv_n.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** + * * 2013/09/14 Saar + * * BLASTEST float : OK + * * BLASTEST double : OK + * CTEST : OK + * TEST : OK + * * + * **************************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG j; + FLOAT *a_ptr; + FLOAT temp; + + ix = 0; + a_ptr = a; + + for (j=0; j + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) > maxf ) + { + max = i; + maxf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/iamax_vector.c b/kernel/riscv64/iamax_vector.c new file mode 100644 index 000000000..3aa64afc9 --- /dev/null +++ b/kernel/riscv64/iamax_vector.c @@ -0,0 +1,191 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_max = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(0, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/iamin.c b/kernel/riscv64/iamin.c new file mode 100644 index 000000000..155292bd5 --- /dev/null +++ b/kernel/riscv64/iamin.c @@ -0,0 +1,77 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=ABS(x[0]); + ix += inc_x; + i++; + + while(i < n) + { + if( ABS(x[ix]) < ABS(minf) ) + { + min = i; + minf = ABS(x[ix]); + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/iamin_vector.c b/kernel/riscv64/iamin_vector.c new file mode 100644 index 000000000..608f19a00 --- /dev/null +++ b/kernel/riscv64/iamin_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLEV_FLOAT(&x[j], gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + vx = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + //fabs(vector) + mask = VMFLTVF_FLOAT(vx, 0, gvl); + v_min = VFRSUBVF_MASK_FLOAT(vx, vx, 0, mask, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/imax.c b/kernel/riscv64/imax.c new file mode 100644 index 000000000..5072dd16e --- /dev/null +++ b/kernel/riscv64/imax.c @@ -0,0 +1,69 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + BLASLONG max=0; + + if (n <= 0 || inc_x <= 0) return(max); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + max = i; + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/imax_vector.c b/kernel/riscv64/imax_vector.c new file mode 100644 index 000000000..44af7101b --- /dev/null +++ b/kernel/riscv64/imax_vector.c @@ -0,0 +1,176 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + FLOAT maxf=-FLT_MAX; + + FLOAT_V_T vx, v_max; + UINT_V_T v_max_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + v_max_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element greater than v_max + mask = VMFLTVV_FLOAT(v_max, vx, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask, gvl); + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + maxf = vx[0]; + mask = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(-FLT_MAX, gvl); + vx = VFREDMAXVS_FLOAT(v_max, vx, gvl); + FLOAT cur_maxf = vx[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask,gvl); + max_index = v_max_index[max_index]; + } + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/imin.c b/kernel/riscv64/imin.c new file mode 100644 index 000000000..598cba387 --- /dev/null +++ b/kernel/riscv64/imin.c @@ -0,0 +1,67 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +/************************************************************************************** +* 2013/08/19 Saar +* BLASTEST float +* BLASTEST double +* +**************************************************************************************/ + +#include "common.h" +#include + + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + BLASLONG min=0; + + if (n <= 0 || inc_x <= 0) return(min); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > minf ) + { + min = i; + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/imin_vector.c b/kernel/riscv64/imin_vector.c new file mode 100644 index 000000000..e6e0e9f9f --- /dev/null +++ b/kernel/riscv64/imin_vector.c @@ -0,0 +1,212 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define ABS fabs +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx, v_min; + UINT_V_T v_min_index; + MASK_T mask; + unsigned int gvl = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLEV_FLOAT(&x[j], gvl); + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLEV_FLOAT(&x[j], gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + unsigned int stride_x = inc_x * sizeof(FLOAT); + unsigned int idx = 0, inc_v = gvl * inc_x; + + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + v_min_index = VMVVX_UINT(0, gvl); + for(i=0,j=0; i < n/gvl; i++){ + vx = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + //index where element less than v_min + mask = VMFLTVV_FLOAT(vx, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask), "r"(gvl) + :"v0"); +#endif +*/ + + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx, gvl); + j += gvl; + idx += inc_v; + } + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + minf = vx[0]; + mask = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min = VLSEV_FLOAT(&x[idx], stride_x, gvl); + + vx = VFMVVF_FLOAT(FLT_MAX, gvl); + vx = VFREDMINVS_FLOAT(v_min, vx, gvl); + FLOAT cur_minf = vx[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + mask = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask,gvl); + min_index = v_min_index[min_index]; + } + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/izamax.c b/kernel/riscv64/izamax.c new file mode 100644 index 000000000..8fe33e95b --- /dev/null +++ b/kernel/riscv64/izamax.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG max=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(max); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + max = i; + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(max+1); +} + + diff --git a/kernel/riscv64/izamax_vector.c b/kernel/riscv64/izamax_vector.c new file mode 100644 index 000000000..62c95d973 --- /dev/null +++ b/kernel/riscv64/izamax_vector.c @@ -0,0 +1,246 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VMFGEVF_FLOAT vmfgevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VMFGEVF_FLOAT vmfgevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT maxf=0.0; + unsigned int max_index = 0; + if (n <= 0 || inc_x <= 0) return(max_index); + + FLOAT_V_T vx0, vx1, v_max; + UINT_V_T v_max_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + v_max = VFMVVF_FLOAT(-1, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element greater than v_max + mask0 = VMFLTVV_FLOAT(v_max, vx0, gvl); + v_max_index = VIDV_MASK_UINT(v_max_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_max_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_max_index = VADDVX_MASK_UINT(v_max_index, v_max_index, j, mask0, gvl); + + //update v_max and start_index j + v_max = VFMAXVV_FLOAT(v_max, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + maxf = vx0[0]; + mask0 = VMFGEVF_FLOAT(v_max, maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_max_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_max = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDMAXVS_FLOAT(v_max, vx0, gvl); + FLOAT cur_maxf = vx0[0]; + if(cur_maxf > maxf){ + //tail index + v_max_index = VIDV_UINT(gvl); + v_max_index = VADDVX_UINT(v_max_index, j, gvl); + + mask0 = VMFGEVF_FLOAT(v_max, cur_maxf, gvl); + max_index = VMFIRSTM(mask0,gvl); + max_index = v_max_index[max_index]; + } + } + return(max_index+1); +} + + diff --git a/kernel/riscv64/izamin.c b/kernel/riscv64/izamin.c new file mode 100644 index 000000000..fb5a0d4cb --- /dev/null +++ b/kernel/riscv64/izamin.c @@ -0,0 +1,81 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG min=0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(min); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + min = i; + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(min+1); +} + + diff --git a/kernel/riscv64/izamin_vector.c b/kernel/riscv64/izamin_vector.c new file mode 100644 index 000000000..38eccf1b5 --- /dev/null +++ b/kernel/riscv64/izamin_vector.c @@ -0,0 +1,247 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if defined(DOUBLE) + +#define RVV_EFLOAT RVV_E64 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VMFLTVV_FLOAT vmfltvv_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VMFLEVF_FLOAT vmflevf_e64xm8_float64xm8 +#define VMFIRSTM vmfirstm_e64xm8 +#define UINT_V_T uint64xm8_t +#define VIDV_MASK_UINT vidv_mask_uint64xm8 +#define VIDV_UINT vidv_uint64xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint64xm8 +#define VADDVX_UINT vaddvx_uint64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#define VMVVX_UINT vmvvx_uint64xm8 +#else + +#define ABS fabsf +#define RVV_EFLOAT RVV_E32 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VMFLTVV_FLOAT vmfltvv_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VMFLEVF_FLOAT vmflevf_e32xm8_float32xm8 +#define VMFIRSTM vmfirstm_e32xm8 +#define UINT_V_T uint32xm8_t +#define VIDV_MASK_UINT vidv_mask_uint32xm8 +#define VIDV_UINT vidv_uint32xm8 +#define VADDVX_MASK_UINT vaddvx_mask_uint32xm8 +#define VADDVX_UINT vaddvx_uint32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#define VMVVX_UINT vmvvx_uint32xm8 +#endif + +#define RVV_M RVV_M8 + +BLASLONG CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + FLOAT minf=FLT_MAX; + unsigned int min_index = 0; + if (n <= 0 || inc_x <= 0) return(min_index); + + FLOAT_V_T vx0, vx1, v_min; + UINT_V_T v_min_index; + MASK_T mask0, mask1; + unsigned int gvl = 0; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG stride_x = inc_x * 2 * sizeof(FLOAT); + BLASLONG inc_xv = gvl * inc_x * 2; + BLASLONG ix = 0; + for(i=0,j=0; i < n/gvl; i++){ + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx0 = VFADDVV_FLOAT(vx0, vx1, gvl); + + //index where element less than v_min + mask0 = VMFLTVV_FLOAT(vx0, v_min, gvl); + v_min_index = VIDV_MASK_UINT(v_min_index, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e64,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1 \n\t" + "vsetvli x0, %2, e32,m8 \n\t" + "vid.v %0, v0.t \n\t" + :"+v"(v_min_index) + :"v"(mask0), "r"(gvl) + :"v0"); +#endif +*/ + v_min_index = VADDVX_MASK_UINT(v_min_index, v_min_index, j, mask0, gvl); + + //update v_min and start_index j + v_min = VFMINVV_FLOAT(v_min, vx0, gvl); + j += gvl; + ix += inc_xv; + } + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + minf = vx0[0]; + mask0 = VMFLEVF_FLOAT(v_min, minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + + if(j < n){ + gvl = vsetvli(n-j, RVV_EFLOAT, RVV_M); + v_min_index = VMVVX_UINT(0, gvl); + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + //fabs(vector) + mask0 = VMFLTVF_FLOAT(vx0, 0, gvl); + vx0 = VFRSUBVF_MASK_FLOAT(vx0, vx0, 0, mask0, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx0) + :"v"(mask0), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); + //fabs(vector) + mask1 = VMFLTVF_FLOAT(vx1, 0, gvl); + vx1 = VFRSUBVF_MASK_FLOAT(vx1, vx1, 0, mask1, gvl); +/* +#if defined(DOUBLE) +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e64,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#else +asm volatile( + "vor.vv v0, %1, %1\n\t" + "vsetvli x0, %3, e32,m8 \n\t" + "vfrsub.vf %0, %0, %2, v0.t \n\t" + :"+v"(vx1) + :"v"(mask1), "f"(zero), "r"(gvl) + :"v0"); +#endif +*/ + v_min = VFADDVV_FLOAT(vx0, vx1, gvl); + vx0 = VFMVVF_FLOAT(FLT_MAX, gvl); + vx0 = VFREDMINVS_FLOAT(v_min, vx0, gvl); + FLOAT cur_minf = vx0[0]; + if(cur_minf < minf){ + //tail index + v_min_index = VIDV_UINT(gvl); + v_min_index = VADDVX_UINT(v_min_index, j, gvl); + + mask0 = VMFLEVF_FLOAT(v_min, cur_minf, gvl); + min_index = VMFIRSTM(mask0,gvl); + min_index = v_min_index[min_index]; + } + } + return(min_index+1); +} + + diff --git a/kernel/riscv64/max.c b/kernel/riscv64/max.c new file mode 100644 index 000000000..2ad956bc0 --- /dev/null +++ b/kernel/riscv64/max.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + + if (n <= 0 || inc_x <= 0) return(maxf); + + maxf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] > maxf ) + { + maxf = x[ix]; + } + ix += inc_x; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/max_vector.c b/kernel/riscv64/max_vector.c new file mode 100644 index 000000000..4ef75452d --- /dev/null +++ b/kernel/riscv64/max_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT maxf=-FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + }else{ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG stride_x = inc_x * sizeof(FLOAT); + if(gvl <= n/2){ + v_max = VFMVVF_FLOAT(-FLT_MAX, gvl); + BLASLONG idx = 0, inc_xv = inc_x * gvl; + for(i=0,j=0; i maxf) + maxf = v0[0]; + j += gvl; + } + } + return(maxf); +} + + diff --git a/kernel/riscv64/min.c b/kernel/riscv64/min.c new file mode 100644 index 000000000..2812fe397 --- /dev/null +++ b/kernel/riscv64/min.c @@ -0,0 +1,65 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : NoTest +* BLASTEST double : NoTest +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf=0.0; + + if (n <= 0 || inc_x <= 0) return(minf); + + minf=x[0]; + ix += inc_x; + i++; + + while(i < n) + { + if( x[ix] < minf ) + { + minf = x[ix]; + } + ix += inc_x; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/min_vector.c b/kernel/riscv64/min_vector.c new file mode 100644 index 000000000..83c965bfa --- /dev/null +++ b/kernel/riscv64/min_vector.c @@ -0,0 +1,116 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + for(i=0,j=0; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + FLOAT absxi = 0.0; + + + if (n <= 0 || inc_x <= 0) return(0.0); + if ( n == 1 ) return( ABS(x[0]) ); + + n *= inc_x; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + absxi = ABS( x[i] ); + if ( scale < absxi ) + { + ssq = 1 + ssq * ( scale / absxi ) * ( scale / absxi ); + scale = absxi ; + } + else + { + ssq += ( absxi/scale ) * ( absxi/scale ); + } + + } + i += inc_x; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/nrm2_vector.c b/kernel/riscv64/nrm2_vector.c new file mode 100644 index 000000000..785c0d2f8 --- /dev/null +++ b/kernel/riscv64/nrm2_vector.c @@ -0,0 +1,220 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); + if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +#define KERNEL16x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t" + +#define KERNEL16x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v26, v10, v2 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v10, v3 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmacc.vv v30, v11, v2 \n\t"\ + "vfmacc.vv v31, v11, v3 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL16x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL16x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v26, v14, v6 \n\t"\ + "vfmacc.vv v27, v14, v7 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmacc.vv v30, v15, v6 \n\t"\ + "vfmacc.vv v31, v15, v7 \n\t" + + +#define KERNEL8x4_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "addi t2, %[PB], 2*4 \n\t"\ + "addi t3, %[PB], 3*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "flw ft2, (t2) \n\t"\ + "flw ft3, (t3) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmv.v.f v11, ft3 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "flw ft7, (t3) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "vfmv.v.f v15, ft7 \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "addi t3, t3, 4*4 \n\t" + + +#define KERNEL8x4_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "flw ft5, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft6, (t2) \n\t"\ + "vfmacc.vv v24, v10, v0 \n\t"\ + "flw ft7, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v10, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t"\ + "vfmacc.vv v28, v11, v0 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v11, v1 \n\t"\ + "vfmv.v.f v14, ft6 \n\t"\ + "vfmv.v.f v15, ft7 \n\t" + +#define KERNEL8x4_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 8*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 8*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "addi %[PB], %[PB], 4*4 \n\t"\ + "flw ft2, (t2) \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "flw ft3, (t3) \n\t"\ + "addi t1, t1, 4*4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t2, t2, 4*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "addi t3, t3, 4*4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t"\ + "vfmv.v.f v10, ft2 \n\t"\ + "vfmv.v.f v11, ft3 \n\t" + +#define KERNEL8x4_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v24, v14, v4 \n\t"\ + "vfmacc.vv v25, v14, v5 \n\t"\ + "vfmacc.vv v28, v15, v4 \n\t"\ + "vfmacc.vv v29, v15, v5 \n\t" + + +#define KERNEL16x2_I \ + "addi t1, %[PB], 1*4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "flw ft1, (t1) \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi t4, %[PA], 4*4 \n\t"\ + "addi t5, %[PA], 8*4 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "addi t6, %[PA], 12*4 \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v16, v8, v0 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M1 \ + "vfmacc.vv v16, v8, v0 \n\t"\ + "vle.v v4, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v8, v1 \n\t"\ + "vle.v v5, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v8, v2 \n\t"\ + "vle.v v6, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v8, v3 \n\t"\ + "vle.v v7, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "flw ft4, (%[PB]) \n\t"\ + "vfmacc.vv v20, v9, v0 \n\t"\ + "flw ft5, (t1) \n\t"\ + "vfmacc.vv v21, v9, v1 \n\t"\ + "vfmv.v.f v12, ft4 \n\t"\ + "vfmacc.vv v22, v9, v2 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmacc.vv v23, v9, v3 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "vfmv.v.f v13, ft5 \n\t" + + +#define KERNEL16x2_M2 \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vle.v v0, (%[PA]) \n\t"\ + "addi %[PA], %[PA], 16*4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vle.v v1, (t4) \n\t"\ + "addi t4, t4, 16*4 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vle.v v2, (t5) \n\t"\ + "addi t5, t5, 16*4 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vle.v v3, (t6) \n\t"\ + "addi t6, t6, 16*4 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "flw ft0, (%[PB]) \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "flw ft1, (t1) \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmv.v.f v8, ft0 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t"\ + "addi %[PB], %[PB], 2*4 \n\t"\ + "addi t1, t1, 2*4 \n\t"\ + "vfmv.v.f v9, ft1 \n\t" + + +#define KERNEL16x2_E \ + "vfmacc.vv v16, v12, v4 \n\t"\ + "vfmacc.vv v17, v12, v5 \n\t"\ + "vfmacc.vv v18, v12, v6 \n\t"\ + "vfmacc.vv v19, v12, v7 \n\t"\ + "vfmacc.vv v20, v13, v4 \n\t"\ + "vfmacc.vv v21, v13, v5 \n\t"\ + "vfmacc.vv v22, v13, v6 \n\t"\ + "vfmacc.vv v23, v13, v7 \n\t" + + +int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc +#ifdef TRMMKERNEL + ,BLASLONG offset +#endif + ) +{ + BLASLONG i,j,k; + FLOAT *C0,*C1,*C2,*C3; + FLOAT *ptrba,*ptrbb; + + FLOAT loadb0,loadb1,loadb2,loadb3; + FLOAT load0,load1,load2,load3,load4,load5,load6,load7; + + FLOAT res0,res1,res2,res3; + FLOAT res4,res5,res6,res7; + FLOAT res8,res9,res10,res11; + FLOAT res12,res13,res14,res15; + + for (j=0; j + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp; + + if ( n < 0 ) return(0); + + while(i < n) + { + + temp = x[ix] ; + x[ix] = y[iy] ; + y[iy] = temp ; + + ix += inc_x ; + iy += inc_y ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/swap_vector.c b/kernel/riscv64/swap_vector.c new file mode 100644 index 000000000..9377bf4b9 --- /dev/null +++ b/kernel/riscv64/swap_vector.c @@ -0,0 +1,173 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + if(gvl <= n/2){ + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = 0; + stride_y = inc_y * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jy += inc_y; + a_ptr += lda; + } + }else if(inc_y == 1){ + jx = 0; + stride_x = inc_x * sizeof(FLOAT); + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + inc_xv = inc_x * gvl; + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLEV_FLOAT(&y[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += alpha * temp2; + jx += inc_x; + a_ptr += lda; + } + }else{ + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + jx = 0; + jy = 0; + for (j=0; j 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += alpha * temp2; + jx += inc_x; + jy += inc_y; + a_ptr += lda; + } + } + return(0); +} + diff --git a/kernel/riscv64/symv_U.c b/kernel/riscv64/symv_U.c new file mode 100644 index 000000000..b5a0c96e9 --- /dev/null +++ b/kernel/riscv64/symv_U.c @@ -0,0 +1,71 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + + +#include "common.h" + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) +{ + BLASLONG i; + BLASLONG ix,iy; + BLASLONG jx,jy; + BLASLONG j; + FLOAT temp1; + FLOAT temp2; + +#if 0 + if( m != offset ) + printf("Symv_U: m=%d offset=%d\n",m,offset); +#endif + + BLASLONG m1 = m - offset; + + jx = m1 * inc_x; + jy = m1 * inc_y; + + for (j=m1; j 0){ + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + } + }else if(inc_x == 1){ + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLEV_FLOAT(&x[i], gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jy += inc_y; + } + }else if(inc_y == 1){ + jx = m1 * inc_x; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLEV_FLOAT(&y[i], gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSEV_FLOAT(&y[i], vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[j] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + } + }else{ + jx = m1 * inc_x; + jy = m1 * inc_y; + a_ptr += m1 * lda; + stride_x = inc_x * sizeof(FLOAT); + stride_y = inc_y * sizeof(FLOAT); + for (j=m1; j 0){ + ix = 0; + iy = 0; + i = 0; + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = inc_x * gvl; + inc_yv = inc_y * gvl; + vr = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMACCVV_FLOAT(vr, vx, va, gvl); + + i += gvl; + ix += inc_xv; + iy += inc_yv; + } + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 = va[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + vy = VLSEV_FLOAT(&y[iy], stride_y, gvl); + va = VLEV_FLOAT(&a_ptr[i], gvl); + vy = VFMACCVF_FLOAT(vy, temp1, va, gvl); + VSSEV_FLOAT(&y[iy], stride_y, vy, gvl); + + vx = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vr = VFMULVV_FLOAT(vx, va, gvl); + va = VFMVVF_FLOAT(0, gvl); + va = VFREDSUM_FLOAT(vr, va, gvl); + temp2 += va[0]; + } + } + y[jy] += temp1 * a_ptr[j] + alpha * temp2; + a_ptr += lda; + jx += inc_x; + jy += inc_y; + } + } + return(0); +} + diff --git a/kernel/riscv64/zamax.c b/kernel/riscv64/zamax.c new file mode 100644 index 000000000..a39bd7821 --- /dev/null +++ b/kernel/riscv64/zamax.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT maxf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + maxf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) > maxf ) + { + maxf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(maxf); +} + + diff --git a/kernel/riscv64/zamax_vector.c b/kernel/riscv64/zamax_vector.c new file mode 100644 index 000000000..a6c742b14 --- /dev/null +++ b/kernel/riscv64/zamax_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMAXVV_FLOAT vfmaxvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMAXVV_FLOAT vfmaxvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT maxf=0.0; + if (n <= 0 || inc_x <= 0) return(maxf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_max; + + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_max = VFMVVF_FLOAT(0, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i maxf) + maxf = v_max[0]; + } + return(maxf); +} diff --git a/kernel/riscv64/zamin.c b/kernel/riscv64/zamin.c new file mode 100644 index 000000000..02eab3e75 --- /dev/null +++ b/kernel/riscv64/zamin.c @@ -0,0 +1,79 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/14 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : NoTest +* TEST : NoTest +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + BLASLONG ix=0; + FLOAT minf; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + minf = CABS1(x,0); + ix += inc_x2; + i++; + + while(i < n) + { + if( CABS1(x,ix) < minf ) + { + minf = CABS1(x,ix); + } + ix += inc_x2; + i++; + } + return(minf); +} + + diff --git a/kernel/riscv64/zamin_vector.c b/kernel/riscv64/zamin_vector.c new file mode 100644 index 000000000..44a7cf1dc --- /dev/null +++ b/kernel/riscv64/zamin_vector.c @@ -0,0 +1,104 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDMINVS_FLOAT vfredminvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFMINVV_FLOAT vfminvv_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDMINVS_FLOAT vfredminvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFMINVV_FLOAT vfminvv_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + if (n <= 0 || inc_x <= 0) return(0.0); + FLOAT minf=FLT_MAX; + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_min; + MASK_T mask0, mask1; + BLASLONG stride_x = inc_x * sizeof(FLOAT) * 2; + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + v_min = VFMVVF_FLOAT(FLT_MAX, gvl); + BLASLONG inc_xv = inc_x * gvl * 2; + for(; i + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + +#define CABS1(x,i) ABS(x[i])+ABS(x[i+1]) + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT sumf = 0.0; + BLASLONG inc_x2; + + if (n <= 0 || inc_x <= 0) return(sumf); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + sumf += CABS1(x,i); + i += inc_x2; + } + return(sumf); +} + + diff --git a/kernel/riscv64/zasum_vector.c b/kernel/riscv64/zasum_vector.c new file mode 100644 index 000000000..d9fa88971 --- /dev/null +++ b/kernel/riscv64/zasum_vector.c @@ -0,0 +1,136 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include + +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float32xm8 +#define MASK_T e32xm8_t +#define VMFLTVF_FLOAT vmfltvf_e32xm8_float32xm8 +#define VFMVVF_FLOAT vfmvvf_float32xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm8 +#define VFADDVV_FLOAT vfaddvv_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VFREDSUMVS_FLOAT vfredsumvs_float64xm8 +#define MASK_T e64xm8_t +#define VMFLTVF_FLOAT vmfltvf_e64xm8_float64xm8 +#define VFMVVF_FLOAT vfmvvf_float64xm8 +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm8 +#define VFADDVV_FLOAT vfaddvv_float64xm8 +#endif +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + BLASLONG ix=0; + FLOAT asumf=0.0; + if (n <= 0 || inc_x <= 0) return(asumf); + unsigned int gvl = 0; + FLOAT_V_T v0, v1, v_zero,v_sum; + + MASK_T mask0, mask1; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + v_zero = VFMVVF_FLOAT(0, gvl); + if(gvl <= n2/2){ + v_sum = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i 0){ + gvl = vsetvli(len, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < len / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < m){ + gvl = vsetvli(m-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/zhemv_UV_vector.c b/kernel/riscv64/zhemv_UV_vector.c new file mode 100644 index 000000000..6fe12c76c --- /dev/null +++ b/kernel/riscv64/zhemv_UV_vector.c @@ -0,0 +1,192 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLSEV_FLOAT vlsev_float32xm4 +#define VSSEV_FLOAT vssev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMACCVF_FLOAT vfmaccvf_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFMULVV_FLOAT vfmulvv_float32xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float32xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLSEV_FLOAT vlsev_float64xm4 +#define VSSEV_FLOAT vssev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMACCVF_FLOAT vfmaccvf_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFMULVV_FLOAT vfmulvv_float64xm4 +#define VFNMSACVF_FLOAT vfnmsacvf_float64xm4 +#define VFNMSACVV_FLOAT vfnmsacvv_float64xm4 +#endif + +int CNAME(BLASLONG m, BLASLONG offset, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG incx, FLOAT *y, BLASLONG incy, FLOAT *buffer){ + BLASLONG i, j, k; + BLASLONG ix, iy, ia; + BLASLONG jx, jy, ja; + FLOAT temp_r1, temp_i1; + FLOAT temp_r2, temp_i2; + FLOAT *a_ptr = a; + unsigned int gvl = 0; + + + FLOAT_V_T va0, va1, vx0, vx1, vy0, vy1, vr0, vr1; + BLASLONG stride_x, stride_y, stride_a, inc_xv, inc_yv, inc_av, lda2; + + BLASLONG inc_x2 = incx * 2; + BLASLONG inc_y2 = incy * 2; + stride_x = inc_x2 * sizeof(FLOAT); + stride_y = inc_y2 * sizeof(FLOAT); + stride_a = 2 * sizeof(FLOAT); + lda2 = lda * 2; + + BLASLONG m1 = m - offset; + a_ptr = a + m1 * lda2; + jx = m1 * inc_x2; + jy = m1 * inc_y2; + ja = m1 * 2; + for(j = m1; j < m; j++){ + temp_r1 = alpha_r * x[jx] - alpha_i * x[jx+1];; + temp_i1 = alpha_r * x[jx+1] + alpha_i * x[jx]; + temp_r2 = 0; + temp_i2 = 0; + ix = 0; + iy = 0; + ia = 0; + i = 0; + if(j > 0){ + gvl = vsetvli(j, RVV_EFLOAT, RVV_M); + inc_xv = incx * gvl * 2; + inc_yv = incy * gvl * 2; + inc_av = gvl * 2; + vr0 = VFMVVF_FLOAT(0, gvl); + vr1 = VFMVVF_FLOAT(0, gvl); + for(k = 0; k < j / gvl; k++){ + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMACCVV_FLOAT(vr0, vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); + +#endif + i += gvl; + ix += inc_xv; + iy += inc_yv; + ia += inc_av; + } + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 = vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 = vx1[0]; + if(i < j){ + gvl = vsetvli(j-i, RVV_EFLOAT, RVV_M); + va0 = VLSEV_FLOAT(&a_ptr[ia], stride_a, gvl); + va1 = VLSEV_FLOAT(&a_ptr[ia+1], stride_a, gvl); + vy0 = VLSEV_FLOAT(&y[iy], stride_y, gvl); + vy1 = VLSEV_FLOAT(&y[iy+1], stride_y, gvl); +#ifndef HEMVREV + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFNMSACVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#else + vy0 = VFMACCVF_FLOAT(vy0, temp_r1, va0, gvl); + vy0 = VFMACCVF_FLOAT(vy0, temp_i1, va1, gvl); + vy1 = VFNMSACVF_FLOAT(vy1, temp_r1, va1, gvl); + vy1 = VFMACCVF_FLOAT(vy1, temp_i1, va0, gvl); +#endif + VSSEV_FLOAT(&y[iy], stride_y, vy0, gvl); + VSSEV_FLOAT(&y[iy+1], stride_y, vy1, gvl); + + vx0 = VLSEV_FLOAT(&x[ix], stride_x, gvl); + vx1 = VLSEV_FLOAT(&x[ix+1], stride_x, gvl); +#ifndef HEMVREV + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFMACCVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFNMSACVV_FLOAT(vr1, vx0, va1, gvl); +#else + vr0 = VFMULVV_FLOAT(vx0, va0, gvl); + vr0 = VFNMSACVV_FLOAT(vr0, vx1, va1, gvl); + vr1 = VFMULVV_FLOAT(vx1, va0, gvl); + vr1 = VFMACCVV_FLOAT(vr1, vx0, va1, gvl); +#endif + + va0 = VFMVVF_FLOAT(0, gvl); + vx0 = VFREDSUM_FLOAT(vr0, va0, gvl); + temp_r2 += vx0[0]; + vx1 = VFREDSUM_FLOAT(vr1, va0, gvl); + temp_i2 += vx1[0]; + } + } + y[jy] += temp_r1 * a_ptr[ja]; + y[jy+1] += temp_i1 * a_ptr[ja]; + y[jy] += alpha_r * temp_r2 - alpha_i * temp_i2; + y[jy+1] += alpha_r * temp_i2 + alpha_i * temp_r2; + jx += inc_x2; + jy += inc_y2; + ja += 2; + a_ptr += lda2; + } + return(0); +} diff --git a/kernel/riscv64/znrm2.c b/kernel/riscv64/znrm2.c new file mode 100644 index 000000000..fc1c8b54a --- /dev/null +++ b/kernel/riscv64/znrm2.c @@ -0,0 +1,106 @@ +/*************************************************************************** +Copyright (c) 2013, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +/************************************************************************************** +* 2013/09/13 Saar +* BLASTEST float : OK +* BLASTEST double : OK +* CTEST : OK +* TEST : OK +* +**************************************************************************************/ + +#include "common.h" +#include + +#if defined(DOUBLE) + +#define ABS fabs + +#else + +#define ABS fabsf + +#endif + + + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0; + FLOAT scale = 0.0; + FLOAT ssq = 1.0; + BLASLONG inc_x2; + FLOAT temp; + + if (n <= 0 || inc_x <= 0) return(0.0); + + inc_x2 = 2 * inc_x; + + n *= inc_x2; + while(i < n) + { + + if ( x[i] != 0.0 ) + { + temp = ABS( x[i] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + if ( x[i+1] != 0.0 ) + { + temp = ABS( x[i+1] ); + if ( scale < temp ) + { + ssq = 1 + ssq * ( scale / temp ) * ( scale / temp ); + scale = temp ; + } + else + { + ssq += ( temp / scale ) * ( temp / scale ); + } + + } + + + i += inc_x2; + } + scale = scale * sqrt( ssq ); + return(scale); + +} + + diff --git a/kernel/riscv64/znrm2_vector.c b/kernel/riscv64/znrm2_vector.c new file mode 100644 index 000000000..b0ebfa5f4 --- /dev/null +++ b/kernel/riscv64/znrm2_vector.c @@ -0,0 +1,278 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M4 +#define FLOAT_V_T float32xm4_t +#define VLEV_FLOAT vlev_float32xm4 +#define VLSEV_FLOAT vlsev_float32xm4 +#define VFREDSUM_FLOAT vfredsumvs_float32xm4 +#define VFMACCVV_FLOAT vfmaccvv_float32xm4 +#define VFMVVF_FLOAT vfmvvf_float32xm4 +#define VFDOTVV_FLOAT vfdotvv_float32xm4 +#define ABS fabsf +#define MASK_T e32xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float32xm4 +#define VMFGTVF_FLOAT vmfgtvf_e32xm4_float32xm4 +#define VMFIRSTM vmfirstm_e32xm4 +#define VFDIVVF_FLOAT vfdivvf_float32xm4 +#define VMFLTVF_FLOAT vmfltvf_e32xm4_float32xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float32xm4 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M4 +#define FLOAT_V_T float64xm4_t +#define VLEV_FLOAT vlev_float64xm4 +#define VLSEV_FLOAT vlsev_float64xm4 +#define VFREDSUM_FLOAT vfredsumvs_float64xm4 +#define VFMACCVV_FLOAT vfmaccvv_float64xm4 +#define VFMVVF_FLOAT vfmvvf_float64xm4 +#define VFDOTVV_FLOAT vfdotvv_float64xm4 +#define ABS fabs +#define MASK_T e64xm4_t +#define VFRSUBVF_MASK_FLOAT vfrsubvf_mask_float64xm4 +#define VMFGTVF_FLOAT vmfgtvf_e64xm4_float64xm4 +#define VMFIRSTM vmfirstm_e64xm4 +#define VFDIVVF_FLOAT vfdivvf_float64xm4 +#define VMFLTVF_FLOAT vmfltvf_e64xm4_float64xm4 +#define VFREDMAXVS_FLOAT vfredmaxvs_float64xm4 +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ + BLASLONG i=0, j=0; + + if ( n < 0 ) return(0.0); +// if(n == 1) return (ABS(x[0])); + + FLOAT_V_T vr, v0, v_zero; + unsigned int gvl = 0; + FLOAT scale = 0.0, ssq = 0.0; + MASK_T mask; + BLASLONG index = 0; + if(inc_x == 1){ + BLASLONG n2 = n * 2; + gvl = vsetvli(n2, RVV_EFLOAT, RVV_M); + vr = VFMVVF_FLOAT(0, gvl); + v_zero = VFMVVF_FLOAT(0, gvl); + for(i=0,j=0; i + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i=0; + BLASLONG ix=0,iy=0; + FLOAT temp[2]; + BLASLONG inc_x2; + BLASLONG inc_y2; + + if ( n < 0 ) return(0); + + inc_x2 = 2 * inc_x; + inc_y2 = 2 * inc_y; + + while(i < n) + { + + temp[0] = x[ix] ; + temp[1] = x[ix+1] ; + x[ix] = y[iy] ; + x[ix+1] = y[iy+1] ; + y[iy] = temp[0] ; + y[iy+1] = temp[1] ; + + ix += inc_x2 ; + iy += inc_y2 ; + i++ ; + + } + return(0); + +} + + diff --git a/kernel/riscv64/zswap_vector.c b/kernel/riscv64/zswap_vector.c new file mode 100644 index 000000000..b655a968c --- /dev/null +++ b/kernel/riscv64/zswap_vector.c @@ -0,0 +1,117 @@ +/*************************************************************************** +Copyright (c) 2020, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#include "common.h" +#include +#if !defined(DOUBLE) +#define RVV_EFLOAT RVV_E32 +#define RVV_M RVV_M8 +#define FLOAT_V_T float32xm8_t +#define VLEV_FLOAT vlev_float32xm8 +#define VLSEV_FLOAT vlsev_float32xm8 +#define VSEV_FLOAT vsev_float32xm8 +#define VSSEV_FLOAT vssev_float32xm8 +#else +#define RVV_EFLOAT RVV_E64 +#define RVV_M RVV_M8 +#define FLOAT_V_T float64xm8_t +#define VLEV_FLOAT vlev_float64xm8 +#define VLSEV_FLOAT vlsev_float64xm8 +#define VSEV_FLOAT vsev_float64xm8 +#define VSSEV_FLOAT vssev_float64xm8 +#endif + +int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT dummy4, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2) +{ + BLASLONG i = 0, j = 0; + BLASLONG ix = 0,iy = 0; + BLASLONG stride_x, stride_y; + FLOAT_V_T vx0, vx1, vy0, vy1; + unsigned int gvl = 0; + + if (n < 0) return(0); + if(inc_x == 1 && inc_y == 1){ + gvl = vsetvli(n, RVV_EFLOAT, RVV_M); + BLASLONG n2 = n * 2; + if(gvl <= n2/2){ + for(i=0,j=0; i #endif diff --git a/kernel/simd/intrin_avx.h b/kernel/simd/intrin_avx.h index 3f79646e0..fbe531417 100644 --- a/kernel/simd/intrin_avx.h +++ b/kernel/simd/intrin_avx.h @@ -12,6 +12,8 @@ typedef __m256d v_f64; ***************************/ #define v_add_f32 _mm256_add_ps #define v_add_f64 _mm256_add_pd +#define v_sub_f32 _mm256_sub_ps +#define v_sub_f64 _mm256_sub_pd #define v_mul_f32 _mm256_mul_ps #define v_mul_f64 _mm256_mul_pd @@ -19,12 +21,20 @@ typedef __m256d v_f64; // multiply and add, a*b + c #define v_muladd_f32 _mm256_fmadd_ps #define v_muladd_f64 _mm256_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm256_fmsub_ps + #define v_mulsub_f64 _mm256_fmsub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // !HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_avx512.h b/kernel/simd/intrin_avx512.h index f00af53e9..8f38eedd9 100644 --- a/kernel/simd/intrin_avx512.h +++ b/kernel/simd/intrin_avx512.h @@ -12,11 +12,16 @@ typedef __m512d v_f64; ***************************/ #define v_add_f32 _mm512_add_ps #define v_add_f64 _mm512_add_pd +#define v_sub_f32 _mm512_sub_ps +#define v_sub_f64 _mm512_sub_pd #define v_mul_f32 _mm512_mul_ps #define v_mul_f64 _mm512_mul_pd // multiply and add, a*b + c #define v_muladd_f32 _mm512_fmadd_ps #define v_muladd_f64 _mm512_fmadd_pd +// multiply and subtract, a*b - c +#define v_mulsub_f32 _mm512_fmsub_ps +#define v_mulsub_f64 _mm512_fmsub_pd BLAS_FINLINE float v_sum_f32(v_f32 a) { __m512 h64 = _mm512_shuffle_f32x4(a, a, _MM_SHUFFLE(3, 2, 3, 2)); diff --git a/kernel/simd/intrin_neon.h b/kernel/simd/intrin_neon.h index 22cef10ca..cd44599fe 100644 --- a/kernel/simd/intrin_neon.h +++ b/kernel/simd/intrin_neon.h @@ -18,6 +18,8 @@ typedef float32x4_t v_f32; ***************************/ #define v_add_f32 vaddq_f32 #define v_add_f64 vaddq_f64 +#define v_sub_f32 vsubq_f32 +#define v_sub_f64 vsubq_f64 #define v_mul_f32 vmulq_f32 #define v_mul_f64 vmulq_f64 @@ -26,16 +28,24 @@ typedef float32x4_t v_f32; // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vfmaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vfmaq_f32(vnegq_f32(c), a, b); } #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return vmlaq_f32(c, a, b); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return vmlaq_f32(vnegq_f32(c), a, b); } #endif // FUSED F64 #if V_SIMD_F64 BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return vfmaq_f64(c, a, b); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return vfmaq_f64(vnegq_f64(c), a, b); } #endif // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/simd/intrin_sse.h b/kernel/simd/intrin_sse.h index 06a3fe78b..6a542072e 100644 --- a/kernel/simd/intrin_sse.h +++ b/kernel/simd/intrin_sse.h @@ -12,22 +12,35 @@ typedef __m128d v_f64; ***************************/ #define v_add_f32 _mm_add_ps #define v_add_f64 _mm_add_pd +#define v_sub_f32 _mm_sub_ps +#define v_sub_f64 _mm_sub_pd #define v_mul_f32 _mm_mul_ps #define v_mul_f64 _mm_mul_pd #ifdef HAVE_FMA3 // multiply and add, a*b + c #define v_muladd_f32 _mm_fmadd_ps #define v_muladd_f64 _mm_fmadd_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_fmsub_ps + #define v_mulsub_f64 _mm_fmsub_pd #elif defined(HAVE_FMA4) // multiply and add, a*b + c #define v_muladd_f32 _mm_macc_ps #define v_muladd_f64 _mm_macc_pd + // multiply and subtract, a*b - c + #define v_mulsub_f32 _mm_msub_ps + #define v_mulsub_f64 _mm_msub_pd #else // multiply and add, a*b + c BLAS_FINLINE v_f32 v_muladd_f32(v_f32 a, v_f32 b, v_f32 c) { return v_add_f32(v_mul_f32(a, b), c); } BLAS_FINLINE v_f64 v_muladd_f64(v_f64 a, v_f64 b, v_f64 c) { return v_add_f64(v_mul_f64(a, b), c); } + // multiply and subtract, a*b - c + BLAS_FINLINE v_f32 v_mulsub_f32(v_f32 a, v_f32 b, v_f32 c) + { return v_sub_f32(v_mul_f32(a, b), c); } + BLAS_FINLINE v_f64 v_mulsub_f64(v_f64 a, v_f64 b, v_f64 c) + { return v_sub_f64(v_mul_f64(a, b), c); } #endif // HAVE_FMA3 // Horizontal add: Calculates the sum of all vector elements. diff --git a/kernel/x86_64/dasum.c b/kernel/x86_64/dasum.c index 8a40ea4b9..ddec21383 100644 --- a/kernel/x86_64/dasum.c +++ b/kernel/x86_64/dasum.c @@ -58,21 +58,19 @@ static FLOAT dasum_kernel(BLASLONG n, FLOAT *x1) } #endif - -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT *x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = dasum_kernel(n, x); - } + } else { n *= inc_x; - - while(i < n) { + while (i < n) { sumf += ABS_K(x[i]); i += inc_x; } @@ -80,3 +78,53 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) return(sumf); } +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void *alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int (*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; + FLOAT * dummy_b; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT *ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, dummy_b, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) *2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif + return(sumf); +} + diff --git a/kernel/x86_64/drot.c b/kernel/x86_64/drot.c index a312b7ff9..66e9ff907 100644 --- a/kernel/x86_64/drot.c +++ b/kernel/x86_64/drot.c @@ -7,10 +7,76 @@ #endif #ifndef HAVE_DROT_KERNEL +#include "../simd/intrin.h" static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; +#if V_SIMD_F64 && V_SIMD > 256 + const int vstep = v_nlanes_f64; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f64 __c = v_setall_f64(c); + v_f64 __s = v_setall_f64(s); + v_f64 vx0, vx1, vx2, vx3; + v_f64 vy0, vy1, vy2, vy3; + v_f64 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f64(x + i); + vx1 = v_loadu_f64(x + i + vstep); + vx2 = v_loadu_f64(x + i + vstep * 2); + vx3 = v_loadu_f64(x + i + vstep * 3); + vy0 = v_loadu_f64(y + i); + vy1 = v_loadu_f64(y + i + vstep); + vy2 = v_loadu_f64(y + i + vstep * 2); + vy3 = v_loadu_f64(y + i + vstep * 3); + + vt0 = v_mul_f64(__s, vy0); + vt1 = v_mul_f64(__s, vy1); + vt2 = v_mul_f64(__s, vy2); + vt3 = v_mul_f64(__s, vy3); + + vt0 = v_muladd_f64(__c, vx0, vt0); + vt1 = v_muladd_f64(__c, vx1, vt1); + vt2 = v_muladd_f64(__c, vx2, vt2); + vt3 = v_muladd_f64(__c, vx3, vt3); + + v_storeu_f64(x + i, vt0); + v_storeu_f64(x + i + vstep, vt1); + v_storeu_f64(x + i + vstep * 2, vt2); + v_storeu_f64(x + i + vstep * 3, vt3); + + vt0 = v_mul_f64(__s, vx0); + vt1 = v_mul_f64(__s, vx1); + vt2 = v_mul_f64(__s, vx2); + vt3 = v_mul_f64(__s, vx3); + + vt0 = v_mulsub_f64(__c, vy0, vt0); + vt1 = v_mulsub_f64(__c, vy1, vt1); + vt2 = v_mulsub_f64(__c, vy2, vt2); + vt3 = v_mulsub_f64(__c, vy3, vt3); + + v_storeu_f64(y + i, vt0); + v_storeu_f64(y + i + vstep, vt1); + v_storeu_f64(y + i + vstep * 2, vt2); + v_storeu_f64(y + i + vstep * 3, vt3); + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f64(x + i); + vy0 = v_loadu_f64(y + i); + + vt0 = v_mul_f64(__s, vy0); + vt0 = v_muladd_f64(__c, vx0, vt0); + v_storeu_f64(x + i, vt0); + + vt0 = v_mul_f64(__s, vx0); + vt0 = v_mulsub_f64(__c, vy0, vt0); + v_storeu_f64(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -53,7 +119,7 @@ static void drot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } - +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; y[i] = c*y[i] - s*x[i]; diff --git a/kernel/x86_64/sasum.c b/kernel/x86_64/sasum.c index 36ec4a737..d0cea9bee 100644 --- a/kernel/x86_64/sasum.c +++ b/kernel/x86_64/sasum.c @@ -67,24 +67,71 @@ static FLOAT sasum_kernel(BLASLONG n, FLOAT *x1) #endif -FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +static FLOAT asum_compute(BLASLONG n, FLOAT * x, BLASLONG inc_x) { - BLASLONG i=0; + BLASLONG i = 0; FLOAT sumf = 0.0; + + if (n <= 0 || inc_x <= 0) return (sumf); - if (n <= 0 || inc_x <= 0) return(sumf); - - if ( inc_x == 1 ) { + if (inc_x == 1) { sumf = sasum_kernel(n, x); } else { - n *= inc_x; while(i < n) { sumf += ABS_K(x[i]); i += inc_x; } - } + return (sumf); +} + +#if defined(SMP) +static int asum_thread_function(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy2, FLOAT *x, BLASLONG inc_x, FLOAT *dummy3, BLASLONG dummy4, FLOAT *result, BLASLONG dummy5) +{ + *(FLOAT *)result = asum_compute(n, x, inc_x); + return 0; +} + +extern int blas_level1_thread_with_return_value(int mode, BLASLONG m, BLASLONG n, BLASLONG k, void * alpha, void *a, BLASLONG lda, void *b, BLASLONG ldb, void *c, BLASLONG ldc, int(*function)(), int nthreads); +#endif + +FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) +{ +#if defined(SMP) + int nthreads; + FLOAT dummy_alpha; +#endif + FLOAT sumf = 0.0; + +#if defined(SMP) + int num_cpu = num_cpu_avail(1); + if (n <= 100000 || inc_x <= 0) + nthreads = 1; + else + nthreads = num_cpu < n/100000 ? num_cpu : n/100000; + if (nthreads == 1) { + sumf = asum_compute(n, x, inc_x); + } + else { + int mode, i; + char result[MAX_CPU_NUMBER * sizeof(double) *2]; + FLOAT * ptr; +#if !defined(DOUBLE) + mode = BLAS_SINGLE | BLAS_REAL; +#else + mode = BLAS_DOUBLE | BLAS_REAL; +#endif + blas_level1_thread_with_return_value(mode, n, 0, 0, &dummy_alpha, x, inc_x, NULL, 0, result, 0, (void *)asum_thread_function, nthreads); + ptr = (FLOAT *)result; + for (i = 0; i < nthreads; i++) { + sumf += (*ptr); + ptr = (FLOAT *)(((char *)ptr) + sizeof(double) * 2); + } + } +#else + sumf = asum_compute(n, x, inc_x); +#endif return(sumf); } diff --git a/kernel/x86_64/srot.c b/kernel/x86_64/srot.c index 021c20d82..3de586cb8 100644 --- a/kernel/x86_64/srot.c +++ b/kernel/x86_64/srot.c @@ -7,10 +7,78 @@ #endif #ifndef HAVE_SROT_KERNEL +#include"../simd/intrin.h" static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) { BLASLONG i = 0; + +#if V_SIMD && (defined(HAVE_FMA3) || V_SIMD > 128) + const int vstep = v_nlanes_f32; + const int unrollx4 = n & (-vstep * 4); + const int unrollx = n & -vstep; + + v_f32 __c = v_setall_f32(c); + v_f32 __s = v_setall_f32(s); + v_f32 vx0, vx1, vx2, vx3; + v_f32 vy0, vy1, vy2, vy3; + v_f32 vt0, vt1, vt2, vt3; + + for (; i < unrollx4; i += vstep * 4) { + vx0 = v_loadu_f32(x + i); + vx1 = v_loadu_f32(x + i + vstep); + vx2 = v_loadu_f32(x + i + vstep * 2); + vx3 = v_loadu_f32(x + i + vstep * 3); + vy0 = v_loadu_f32(y + i); + vy1 = v_loadu_f32(y + i + vstep); + vy2 = v_loadu_f32(y + i + vstep * 2); + vy3 = v_loadu_f32(y + i + vstep * 3); + + vt0 = v_mul_f32(__s, vy0); + vt1 = v_mul_f32(__s, vy1); + vt2 = v_mul_f32(__s, vy2); + vt3 = v_mul_f32(__s, vy3); + + vt0 = v_muladd_f32(__c, vx0, vt0); + vt1 = v_muladd_f32(__c, vx1, vt1); + vt2 = v_muladd_f32(__c, vx2, vt2); + vt3 = v_muladd_f32(__c, vx3, vt3); + + v_storeu_f32(x + i, vt0); + v_storeu_f32(x + i + vstep, vt1); + v_storeu_f32(x + i + vstep * 2, vt2); + v_storeu_f32(x + i + vstep * 3, vt3); + + vt0 = v_mul_f32(__s, vx0); + vt1 = v_mul_f32(__s, vx1); + vt2 = v_mul_f32(__s, vx2); + vt3 = v_mul_f32(__s, vx3); + + vt0 = v_mulsub_f32(__c, vy0, vt0); + vt1 = v_mulsub_f32(__c, vy1, vt1); + vt2 = v_mulsub_f32(__c, vy2, vt2); + vt3 = v_mulsub_f32(__c, vy3, vt3); + + v_storeu_f32(y + i, vt0); + v_storeu_f32(y + i + vstep, vt1); + v_storeu_f32(y + i + vstep * 2, vt2); + v_storeu_f32(y + i + vstep * 3, vt3); + + } + + for (; i < unrollx; i += vstep) { + vx0 = v_loadu_f32(x + i); + vy0 = v_loadu_f32(y + i); + + vt0 = v_mul_f32(__s, vy0); + vt0 = v_muladd_f32(__c, vx0, vt0); + v_storeu_f32(x + i, vt0); + + vt0 = v_mul_f32(__s, vx0); + vt0 = v_mulsub_f32(__c, vy0, vt0); + v_storeu_f32(y + i, vt0); + } +#else FLOAT f0, f1, f2, f3; FLOAT x0, x1, x2, x3; FLOAT g0, g1, g2, g3; @@ -20,7 +88,6 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) FLOAT* yp = y; BLASLONG n1 = n & (~7); - while (i < n1) { x0 = xp[0]; y0 = yp[0]; @@ -53,6 +120,7 @@ static void srot_kernel(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s) yp += 4; i += 4; } +#endif while (i < n) { FLOAT temp = c*x[i] + s*y[i]; diff --git a/lapack/laswp/riscv64/Makefile b/lapack/laswp/riscv64/Makefile new file mode 100644 index 000000000..75411deb5 --- /dev/null +++ b/lapack/laswp/riscv64/Makefile @@ -0,0 +1,13 @@ +TOPDIR = ../../.. +include ../../../Makefile.system + +ifndef LASWP +LASWP = ../generic/laswp_k.c +endif + +ifndef ZLASWP +ZLASWP = ../generic/zlaswp_k.c +endif + +include ../generic/Makefile + diff --git a/param.h b/param.h index 2047e4776..7789c83c7 100644 --- a/param.h +++ b/param.h @@ -2676,6 +2676,84 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #define SYMV_P 16 #endif +#ifdef RISCV64_GENERIC +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 2 +#define SGEMM_DEFAULT_UNROLL_N 2 + +#define DGEMM_DEFAULT_UNROLL_M 2 +#define DGEMM_DEFAULT_UNROLL_N 2 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 128 +#define DGEMM_DEFAULT_P 128 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 120 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + +#ifdef C910V +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 +#define GEMM_DEFAULT_ALIGN 0x03fffUL + +#define SGEMM_DEFAULT_UNROLL_M 16 +#define SGEMM_DEFAULT_UNROLL_N 4 + +#define DGEMM_DEFAULT_UNROLL_M 8 +#define DGEMM_DEFAULT_UNROLL_N 4 + +#define CGEMM_DEFAULT_UNROLL_M 2 +#define CGEMM_DEFAULT_UNROLL_N 2 + +#define ZGEMM_DEFAULT_UNROLL_M 2 +#define ZGEMM_DEFAULT_UNROLL_N 2 + +#define SGEMM_DEFAULT_P 160 +#define DGEMM_DEFAULT_P 160 +#define CGEMM_DEFAULT_P 96 +#define ZGEMM_DEFAULT_P 64 + +#define SGEMM_DEFAULT_Q 240 +#define DGEMM_DEFAULT_Q 128 +#define CGEMM_DEFAULT_Q 120 +#define ZGEMM_DEFAULT_Q 120 + +#define SGEMM_DEFAULT_R 12288 +#define DGEMM_DEFAULT_R 8192 +#define CGEMM_DEFAULT_R 4096 +#define ZGEMM_DEFAULT_R 4096 + +#define SYMV_P 16 + +#define GEMM_DEFAULT_OFFSET_A 0 +#define GEMM_DEFAULT_OFFSET_B 0 + +#endif + #ifdef ARMV7 #define SNUMOPT 2 #define DNUMOPT 2 diff --git a/test/Makefile b/test/Makefile index eb3bc3447..1ecce0be7 100644 --- a/test/Makefile +++ b/test/Makefile @@ -258,6 +258,12 @@ endif FLDFLAGS = $(FFLAGS:-fPIC=) $(LDFLAGS) + +ifeq ($(CORE), C910V) +EXTRALIB = +CEXTRALIB = +endif + ifeq ($(USE_OPENMP), 1) ifeq ($(F_COMPILER), GFORTRAN) ifeq ($(C_COMPILER), CLANG) diff --git a/utest/CMakeLists.txt b/utest/CMakeLists.txt index dc5175fc5..357e61301 100644 --- a/utest/CMakeLists.txt +++ b/utest/CMakeLists.txt @@ -27,13 +27,17 @@ endif () # known to hang with the native Windows and Android threads # FIXME needs checking if this works on any of the other platforms -if (NOT USE_OPENMP) if (OS_CYGWIN_NT OR OS_LINUX) +if (NOT USE_OPENMP) set(OpenBLAS_utest_src ${OpenBLAS_utest_src} test_fork.c ) endif() +set(OpenBLAS_utest_src + ${OpenBLAS_utest_src} + test_post_fork.c + ) endif() if (NOT NO_LAPACK) diff --git a/utest/Makefile b/utest/Makefile index 31d4ccf00..ac8c6f72a 100644 --- a/utest/Makefile +++ b/utest/Makefile @@ -25,10 +25,11 @@ endif #this does not work with OpenMP nor with native Windows or Android threads # FIXME TBD if this works on OSX, SunOS, POWER and zarch -ifndef USE_OPENMP ifeq ($(OSNAME), $(filter $(OSNAME),Linux CYGWIN_NT)) +ifneq ($(USE_OPENMP), 1) OBJS += test_fork.o endif +OBJS += test_post_fork.o endif ifeq ($(C_COMPILER), PGI) diff --git a/utest/test_fork.c b/utest/test_fork.c index 5c976f920..bd531e7fb 100644 --- a/utest/test_fork.c +++ b/utest/test_fork.c @@ -36,7 +36,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include #include "openblas_utest.h" -void* xmalloc(size_t n) +static void* xmalloc(size_t n) { void* tmp; tmp = malloc(n); @@ -49,7 +49,7 @@ void* xmalloc(size_t n) } #ifdef BUILD_DOUBLE -void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) { char trans1 = 'T'; char trans2 = 'N'; diff --git a/utest/test_post_fork.c b/utest/test_post_fork.c new file mode 100644 index 000000000..9370a02ce --- /dev/null +++ b/utest/test_post_fork.c @@ -0,0 +1,131 @@ +/***************************************************************************** +Copyright (c) 2011-2020, The OpenBLAS Project +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: + + 1. Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + 2. Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in + the documentation and/or other materials provided with the + distribution. + 3. Neither the name of the OpenBLAS project nor the names of + its contributors may be used to endorse or promote products + derived from this software without specific prior written + permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +**********************************************************************************/ + +#include +#include +#include +#ifdef USE_OPENMP +#include +#endif +#include "openblas_utest.h" + +static void* xmalloc(size_t n) +{ + void* tmp; + tmp = malloc(n); + if (tmp == NULL) { + fprintf(stderr, "You are about to die\n"); + exit(1); + } else { + return tmp; + } +} + +#ifdef BUILD_DOUBLE +static void check_dgemm(double *a, double *b, double *result, double *expected, blasint n) +{ + char trans1 = 'T'; + char trans2 = 'N'; + double zerod = 0, oned = 1; + int i; + BLASFUNC(dgemm)(&trans1, &trans2, &n, &n, &n, &oned, a, &n, b, &n, &zerod, result, &n); + for(i = 0; i < n * n; ++i) { + ASSERT_DBL_NEAR_TOL(expected[i], result[i], DOUBLE_EPS); + } +} +#endif + +CTEST(fork, safety_after_fork_in_parent) +{ +#ifndef BUILD_DOUBLE +exit(0); +#else + blasint n = 100; + int i, nthreads_omp; + + double *a, *b, *c, *d; + size_t n_bytes; + + pid_t fork_pid; + + n_bytes = sizeof(*a) * n * n; + + a = xmalloc(n_bytes); + b = xmalloc(n_bytes); + c = xmalloc(n_bytes); + d = xmalloc(n_bytes); + + // Put ones in a, b and n in c (result) + for(i = 0; i < n * n; ++i) { + a[i] = 1; + b[i] = 1; + c[i] = 1 * n; + } + + // Test that OpenBLAS works after a fork. + // This situation routinely happens with Pythons numpy where a + // `sys.platform` calls `uname` in a forked process. + // So we simulate this situation here. + + // There was an issue where a different number of OpenBLAS and OpenMP + // threads triggered a memory leak. So run this multiple times + // with different number of threads set. +#ifdef USE_OPENMP + nthreads_omp = omp_get_max_threads(); + // Run with half the max OMP threads, the max threads and twice that + for(i = (nthreads_omp + 1) / 2; i <= nthreads_omp * 2; i *= 2) { + omp_set_num_threads(i); +#endif + + fork_pid = fork(); + if (fork_pid == -1) { + CTEST_ERR("Failed to fork process."); + } else if (fork_pid == 0) { + // Just pretend to do something, e.g. call `uname`, then exit + exit(0); + } else { + // Wait for the child to finish and check the exit code. + int child_status = 0; + pid_t wait_pid = wait(&child_status); + ASSERT_EQUAL(wait_pid, fork_pid); + ASSERT_EQUAL(0, WEXITSTATUS (child_status)); + + // Now OpenBLAS has to work + check_dgemm(a, b, d, c, n); + } +#ifdef USE_OPENMP + } +#endif + +#endif +}