From 5d0d1c555195a391fe5d029427dfbf7b942ecdf9 Mon Sep 17 00:00:00 2001 From: Heller Zheng Date: Tue, 15 Nov 2022 18:22:21 -0800 Subject: [PATCH] Remove redundant files --- Makefile.install | 5 - kernel/riscv64/KERNEL.x280 | 36 +- kernel/riscv64/gemm_ncopy_2_rvv.c | 92 --- kernel/riscv64/gemm_ncopy_4_rvv.c | 123 ---- kernel/riscv64/gemm_tcopy_2_rvv.c | 108 ---- kernel/riscv64/gemm_tcopy_4_rvv.c | 236 -------- kernel/riscv64/gemmkernel_2x2_rvv.c | 214 ------- kernel/riscv64/gemmkernel_4x4_rvv.c | 508 ---------------- kernel/riscv64/trmmkernel_2x2_rvv.c | 342 ----------- kernel/riscv64/trmmkernel_4x4_rvv.c | 881 ---------------------------- 10 files changed, 2 insertions(+), 2543 deletions(-) delete mode 100644 kernel/riscv64/gemm_ncopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_ncopy_4_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_2_rvv.c delete mode 100644 kernel/riscv64/gemm_tcopy_4_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/gemmkernel_4x4_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_2x2_rvv.c delete mode 100644 kernel/riscv64/trmmkernel_4x4_rvv.c diff --git a/Makefile.install b/Makefile.install index f1adaa271..168d08f72 100644 --- a/Makefile.install +++ b/Makefile.install @@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS OPENBLAS_INCLUDE_DIR := $(PREFIX)/include OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib OPENBLAS_BINARY_DIR := $(PREFIX)/bin -OPENBLAS_RELEASE_DIR := $(PREFIX)/release OPENBLAS_BUILD_DIR := $(CURDIR) OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE) OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake @@ -39,7 +38,6 @@ install : lib.grd @-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)" - @-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)" @-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)" @echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR) @@ -204,8 +202,5 @@ endif @echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)" @echo Install OK! -#Generating release tar - @echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz - @tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release . diff --git a/kernel/riscv64/KERNEL.x280 b/kernel/riscv64/KERNEL.x280 index 2eb60f2b4..4d64354fb 100644 --- a/kernel/riscv64/KERNEL.x280 +++ b/kernel/riscv64/KERNEL.x280 @@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c # SGEMM_UNROLL_N set in params.h -ifeq ($(SGEMM_UNROLL_N), 2) -SGEMMKERNEL = gemmkernel_2x2_rvv.c -SGEMMONCOPY = gemm_ncopy_2_rvv.c -SGEMMOTCOPY = gemm_tcopy_2_rvv.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 4) -SGEMMKERNEL = gemmkernel_4x4_rvv.c -SGEMMONCOPY = gemm_ncopy_4_rvv.c -SGEMMOTCOPY = ../generic/gemm_tcopy_4.c -SGEMMONCOPYOBJ = sgemm_oncopy.o -SGEMMOTCOPYOBJ = sgemm_otcopy.o - -STRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(SGEMM_UNROLL_N), 8) +ifeq ($(SGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX SGEMMKERNEL = gemmkernel_rvv_v1x8.c SGEMMINCOPY = gemm_ncopy_rvv_v1.c @@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c endif # SGEMM_UNROLL_N set in params.h -ifeq ($(DGEMM_UNROLL_N), 2) -DGEMMKERNEL = gemmkernel_2x2_rvv.c -DGEMMONCOPY = gemm_ncopy_2_rvv.c -DGEMMOTCOPY = gemm_tcopy_2_rvv.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_2x2_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 4) -DGEMMKERNEL = gemmkernel_4x4_rvv.c -DGEMMONCOPY = gemm_ncopy_4_rvv.c -DGEMMOTCOPY = ../generic/gemm_tcopy_4.c -DGEMMONCOPYOBJ = dgemm_oncopy.o -DGEMMOTCOPYOBJ = dgemm_otcopy.o - -DTRMMKERNEL = trmmkernel_4x4_rvv.c -else ifeq ($(DGEMM_UNROLL_N), 8) +ifeq ($(DGEMM_UNROLL_N), 8) # UNROLL_M is VLMAX DGEMMKERNEL = gemmkernel_rvv_v1x8.c DGEMMINCOPY = gemm_ncopy_rvv_v1.c diff --git a/kernel/riscv64/gemm_ncopy_2_rvv.c b/kernel/riscv64/gemm_ncopy_2_rvv.c deleted file mode 100644 index 5f55bc349..000000000 --- a/kernel/riscv64/gemm_ncopy_2_rvv.c +++ /dev/null @@ -1,92 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define FLOAT_V_T vfloat32m4_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VSEV_FLOAT vse32_v_f32m4 -#define VSSEG2_FLOAT vsseg2e32_v_f32m4 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define FLOAT_V_T vfloat64m4_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VSEV_FLOAT vse64_v_f64m4 -#define VSSEG2_FLOAT vsseg2e64_v_f64m4 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset; - FLOAT_V_T v1, v2; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - - for(j = (n >> 1); j > 0; j--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset, vl); - VSEV_FLOAT(b_offset, v1, vl); - - a_offset += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_ncopy_4_rvv.c b/kernel/riscv64/gemm_ncopy_4_rvv.c deleted file mode 100644 index 4d4efe4c9..000000000 --- a/kernel/riscv64/gemm_ncopy_4_rvv.c +++ /dev/null @@ -1,123 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VSEV_FLOAT vse32_v_f32m2 -#define VSSEG2_FLOAT vsseg2e32_v_f32m2 -#define VSSEG4_FLOAT vsseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VSEV_FLOAT vse64_v_f64m2 -#define VSSEG2_FLOAT vsseg2e64_v_f64m2 -#define VSSEG4_FLOAT vsseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_ncopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset; - - FLOAT_V_T v1, v2, v3, v4; - size_t vl; - - //fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - for(j = (n >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - v3 = VLEV_FLOAT(a_offset3, vl); - v4 = VLEV_FLOAT(a_offset4, vl); - - VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl); - - a_offset1 += vl; - a_offset2 += vl; - a_offset3 += vl; - a_offset4 += vl; - b_offset += vl*4; - } - } - - if (n & 2) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - v2 = VLEV_FLOAT(a_offset2, vl); - - VSSEG2_FLOAT(b_offset, v1, v2, vl); - - a_offset1 += vl; - a_offset2 += vl; - b_offset += vl*2; - } - } - - if (n & 1) { - a_offset1 = a_offset; - - for(i = m; i > 0; i -= vl) { - vl = VSETVL(i); - - v1 = VLEV_FLOAT(a_offset1, vl); - - VSEV_FLOAT(b_offset, v1, vl); - - a_offset1 += vl; - b_offset += vl; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_2_rvv.c b/kernel/riscv64/gemm_tcopy_2_rvv.c deleted file mode 100644 index 963e1be69..000000000 --- a/kernel/riscv64/gemm_tcopy_2_rvv.c +++ /dev/null @@ -1,108 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_2.c - -int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b) -{ - BLASLONG i, j; - IFLOAT *a_offset, *a_offset1, *a_offset2; - IFLOAT *b_offset, *b_offset1, *b_offset2; - FLOAT_V_T v1a, v1b, v2a, v2b; - size_t vl; - - //fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU - - a_offset = a; - b_offset = b; - b_offset2 = b + m * (n & ~1); - - for(i = (m >> 1); i > 0; i--) { - - a_offset1 = a_offset; - a_offset2 = a_offset + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 4; - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl); - VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl); - - VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl); - - a_offset1 += vl * 2; - a_offset2 += vl * 2; - b_offset1 += vl * m * 2; - } - - if (n & 1) { - *(b_offset2 + 0) = *(a_offset1 + 0); - *(b_offset2 + 1) = *(a_offset2 + 0); - b_offset2 += 2; - } - } - - if (m & 1) { - - for(j = (n >> 1); j > 0; j -= vl) { - vl = VSETVL(j); - - VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl); - - VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl); - - a_offset += vl * 2; - b_offset += vl * m * 2; - } - - if (n & 1){ - *(b_offset2 + 0) = *(a_offset + 0); - } - } - - return 0; -} diff --git a/kernel/riscv64/gemm_tcopy_4_rvv.c b/kernel/riscv64/gemm_tcopy_4_rvv.c deleted file mode 100644 index ac9974b24..000000000 --- a/kernel/riscv64/gemm_tcopy_4_rvv.c +++ /dev/null @@ -1,236 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define FLOAT_V_T vfloat32m2_t -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VSSSEG2_FLOAT vssseg2e32_v_f32m2 -#define VSSSEG4_FLOAT vssseg4e32_v_f32m2 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define FLOAT_V_T vfloat64m2_t -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VSSSEG2_FLOAT vssseg2e64_v_f64m2 -#define VSSSEG4_FLOAT vssseg4e64_v_f64m2 -#endif - -// Optimizes the implementation in ../generic/gemm_tcopy_4.c - -int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b) -{ - BLASLONG i, j; - - FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4; - FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3; - FLOAT ctemp1, ctemp2, ctemp3, ctemp4; - FLOAT ctemp5, ctemp6, ctemp7, ctemp8; - FLOAT ctemp9, ctemp10, ctemp11, ctemp12; - FLOAT ctemp13, ctemp14, ctemp15, ctemp16; - - //fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda); - - a_offset = a; - b_offset = b; - - b_offset2 = b + m * (n & ~3); - b_offset3 = b + m * (n & ~1); - - for(j = (m >> 2); j > 0; j--) { - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset3 = a_offset2 + lda; - a_offset4 = a_offset3 + lda; - a_offset += 4 * lda; - - b_offset1 = b_offset; - b_offset += 16; - - for(i = (n >> 2); i > 0; i--) { - v1 = VLEV_FLOAT(a_offset1, 4); - v2 = VLEV_FLOAT(a_offset2, 4); - v3 = VLEV_FLOAT(a_offset3, 4); - v4 = VLEV_FLOAT(a_offset4, 4); - - a_offset1 += 4; - a_offset2 += 4; - a_offset3 += 4; - a_offset4 += 4; - - VSEV_FLOAT(b_offset1, v1, 4); - VSEV_FLOAT(b_offset2+4, v2, 4); - VSEV_FLOAT(b_offset2+8, v3, 4); - VSEV_FLOAT(b_offset2+12, v4, 4); - - b_offset1 += m * 4; - } - - if (n & 2) { - v1 = VLEV_FLOAT(a_offset1, 2); - v2 = VLEV_FLOAT(a_offset2, 2); - v3 = VLEV_FLOAT(a_offset3, 2); - v4 = VLEV_FLOAT(a_offset4, 2); - - a_offset1 += 2; - a_offset2 += 2; - a_offset3 += 2; - a_offset4 += 2; - - VSEV_FLOAT(b_offset2, v1, 2); - VSEV_FLOAT(b_offset2+2, v2, 2); - VSEV_FLOAT(b_offset2+4, v3, 2); - VSEV_FLOAT(b_offset2+6, v4, 2); - - b_offset2 += 8; - } - - if (n & 1) { - v1 = VLEV_FLOAT(a_offset1, 1); - v2 = VLEV_FLOAT(a_offset2, 1); - v3 = VLEV_FLOAT(a_offset3, 1); - v4 = VLEV_FLOAT(a_offset4, 1); - - VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1); - - b_offset3 += 4; - } - - } - -// TODO cleanup - - if (m & 2){ - a_offset1 = a_offset; - a_offset2 = a_offset1 + lda; - a_offset += 2 * lda; - - b_offset1 = b_offset; - b_offset += 8; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - ctemp5 = *(a_offset2 + 0); - ctemp6 = *(a_offset2 + 1); - ctemp7 = *(a_offset2 + 2); - ctemp8 = *(a_offset2 + 3); - - a_offset1 += 4; - a_offset2 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - *(b_offset1 + 4) = ctemp5; - *(b_offset1 + 5) = ctemp6; - *(b_offset1 + 6) = ctemp7; - *(b_offset1 + 7) = ctemp8; - - b_offset1 += m * 4; - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - - ctemp3 = *(a_offset2 + 0); - ctemp4 = *(a_offset2 + 1); - - a_offset1 += 2; - a_offset2 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - *(b_offset2 + 2) = ctemp3; - *(b_offset2 + 3) = ctemp4; - - b_offset2 += 4; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset2 + 0); - - *(b_offset3 + 0) = ctemp1; - *(b_offset3 + 1) = ctemp2; - b_offset3 += 2; - } - } - - if (m & 1){ - a_offset1 = a_offset; - b_offset1 = b_offset; - - i = (n >> 2); - if (i > 0){ - do{ - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - ctemp3 = *(a_offset1 + 2); - ctemp4 = *(a_offset1 + 3); - - a_offset1 += 4; - - *(b_offset1 + 0) = ctemp1; - *(b_offset1 + 1) = ctemp2; - *(b_offset1 + 2) = ctemp3; - *(b_offset1 + 3) = ctemp4; - - b_offset1 += 4 * m; - - i --; - }while(i > 0); - } - - if (n & 2) { - ctemp1 = *(a_offset1 + 0); - ctemp2 = *(a_offset1 + 1); - a_offset1 += 2; - - *(b_offset2 + 0) = ctemp1; - *(b_offset2 + 1) = ctemp2; - } - - if (n & 1) { - ctemp1 = *(a_offset1 + 0); - *(b_offset3 + 0) = ctemp1; - } - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_2x2_rvv.c b/kernel/riscv64/gemmkernel_2x2_rvv.c deleted file mode 100644 index ec8961ced..000000000 --- a/kernel/riscv64/gemmkernel_2x2_rvv.c +++ /dev/null @@ -1,214 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG2_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG2_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/2; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/2; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/gemmkernel_4x4_rvv.c b/kernel/riscv64/gemmkernel_4x4_rvv.c deleted file mode 100644 index aa58bcc76..000000000 --- a/kernel/riscv64/gemmkernel_4x4_rvv.c +++ /dev/null @@ -1,508 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m1(n) -#define VSETVL_MAX vsetvlmax_e32m1() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m1_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m1 -#define VLSEG2_FLOAT vlseg2e32_v_f32m1 -#define VLSEG4_FLOAT vlseg4e32_v_f32m1 -#define VFMVVF_FLOAT vfmv_v_f_f32m1 -#define VFMACCVF_FLOAT vfmacc_vf_f32m1 -#define VFMACCVV_FLOAT vfmacc_vv_f32m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m1_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m1(n) -#define VSETVL_MAX vsetvlmax_e64m1() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m1_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m1 -#define VLSEG2_FLOAT vlseg2e64_v_f64m1 -#define VLSEG4_FLOAT vlseg4e64_v_f64m1 -#define VFMVVF_FLOAT vfmv_v_f_f64m1 -#define VFMACCVF_FLOAT vfmacc_vf_f64m1 -#define VFMACCVV_FLOAT vfmacc_vv_f64m1 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m1_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - -// Optimizes the implementation in ../generic/gemm_kernel_2x2.c - -int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3; - IFLOAT *ptrba,*ptrbb; - - //fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU - - FLOAT_V_T va0, va1, va2, va3; - FLOAT_V_T vb0, vb1, vb2, vb3; - FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7; - FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3; - FLOAT_V_T_M1 v_z0; - - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vlmax = VSETVL_MAX; - size_t vl; - - for (j = bn/4; j > 0; j--) { - C0 = C; - C1 = C0 + ldc; - C2 = C1 + ldc; - C3 = C2 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - vres8 = VFMVVF_FLOAT(0.0, vlmax); - vres9 = VFMVVF_FLOAT(0.0, vlmax); - vres10 = VFMVVF_FLOAT(0.0, vlmax); - vres11 = VFMVVF_FLOAT(0.0, vlmax); - vres12 = VFMVVF_FLOAT(0.0, vlmax); - vres13 = VFMVVF_FLOAT(0.0, vlmax); - vres14 = VFMVVF_FLOAT(0.0, vlmax); - vres15 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl); - vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl); - vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl); - vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl); - - vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl); - vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl); - vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl); - vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl); - - ptrba += vl*4; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - C2 += 4; - C3 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl); - vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl); - vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl); - - ptrba += vl*2; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - C1 += 2; - C2 += 2; - C3 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl); - vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl*4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 1; - C1 += 1; - C2 += 1; - C3 += 1; - } - - bb += (bk<<2); - C += (ldc<<2); - } - - if(bn & 2) { - - C0 = C; - C1 = C0 + ldc; - ptrba = ba; - - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - vres4 = VFMVVF_FLOAT(0.0, vlmax); - vres5 = VFMVVF_FLOAT(0.0, vlmax); - vres6 = VFMVVF_FLOAT(0.0, vlmax); - vres7 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl); - vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl); - vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl); - vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl); - - ptrba += vl*4; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - C1 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl*2; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 2; - C1 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl*2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 1; - C1 += 1; - } - - bb += (bk<<1); - C += (ldc<<1); - } - - if(bn & 1) { - C0 = C; - ptrba = ba; - for (i = bm/4; i > 0; i--) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl); - vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl); - - ptrba += vl*4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3); - - C0 += 4; - } - - if(bm & 2) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - - ptrba += vl*2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1); - - C0 += 2; - } - - if(bm & 1) { - ptrbb = bb; - - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = bk; k > 0; k -= vl) { - vl = VSETVL(k); - - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax); - C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0); - - C0 += 1; - } - - bb += (bk<<0); - C += ldc; - } - - return 0; -} diff --git a/kernel/riscv64/trmmkernel_2x2_rvv.c b/kernel/riscv64/trmmkernel_2x2_rvv.c deleted file mode 100644 index 127e76970..000000000 --- a/kernel/riscv64/trmmkernel_2x2_rvv.c +++ /dev/null @@ -1,342 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m4(n) -#define VSETVL_MAX vsetvlmax_e32m4() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m4_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m4 -#define VLSEG_FLOAT vlseg2e32_v_f32m4 -#define VFMVVF_FLOAT vfmv_v_f_f32m4 -#define VFMACCVF_FLOAT vfmacc_vf_f32m4 -#define VFMACCVV_FLOAT vfmacc_vv_f32m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m4(n) -#define VSETVL_MAX vsetvlmax_e64m4() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m4_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m4 -#define VLSEG_FLOAT vlseg2e64_v_f64m4 -#define VFMVVF_FLOAT vfmv_v_f_f64m4 -#define VFMACCVF_FLOAT vfmacc_vf_f64m4 -#define VFMACCVV_FLOAT vfmacc_vv_f64m4 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_2x2.c - - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc -#ifdef TRMMKERNEL - ,BLASLONG offset -#endif - ) -{ - BLASLONG i,j,k; - FLOAT *C0,*C1,*ptrba,*ptrbb; - BLASLONG off, temp; - - FLOAT_V_T va0, va1, vb0, vb1; - FLOAT_V_T vres0, vres1, vres2, vres3; - FLOAT_V_T_M1 v_res, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - -#if defined(TRMMKERNEL) && !defined(LEFT) - off = -offset; -#else - off = 0; -#endif - - for (j = bn/2; j > 0; j--) - { - C0 = C; - C1 = C0+ldc; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - vres2 = VFMVVF_FLOAT(0.0, vlmax); - vres3 = VFMVVF_FLOAT(0.0, vlmax); - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl); - vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl); - vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax); - C1[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 2; -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 2; -#endif - C0 = C0+2; - C1 = C1+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off*2; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; -#else - temp = off+2; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl); - vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 2; -#endif - ptrba += temp; - ptrbb += temp*2; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - C1 = C1+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - if (bn & 1) - { - C0 = C; -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - ptrba = ba; - - for (i = bm/2; i > 0; i--) - { -#if (defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off; -#endif - - -#if (defined(LEFT) && !defined(TRANSA)) || \ - (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; -#else - temp = off+1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - vres1 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - vb0 = VLEV_FLOAT(ptrbb, vl); - VLSEG_FLOAT(&va0, &va1, ptrba, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl); - - ptrba += vl * 2; - ptrbb += vl; - - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax); - C0[1] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || \ - (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; -#else - temp -= 1; -#endif - ptrba += temp*2; - ptrbb += temp; -#endif -#ifdef LEFT - off += 2; -#endif - - C0 = C0+2; - } - - if (bm & 1) - { -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off; - ptrbb = bb+off; -#endif - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off + 1; -#else - temp = off + 1; -#endif - vres0 = VFMVVF_FLOAT(0.0, vlmax); - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl); - ptrba += vl; - ptrbb += vl; - } - v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(v_res); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk-off; -#ifdef LEFT - temp -= 1; -#else - temp -= 1; -#endif - ptrba += temp; - ptrbb += temp; -#endif -#ifdef LEFT - off += 1; -#endif - C0 = C0+1; - } -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -} - diff --git a/kernel/riscv64/trmmkernel_4x4_rvv.c b/kernel/riscv64/trmmkernel_4x4_rvv.c deleted file mode 100644 index 3e46c6348..000000000 --- a/kernel/riscv64/trmmkernel_4x4_rvv.c +++ /dev/null @@ -1,881 +0,0 @@ -/*************************************************************************** -Copyright (c) 2022, The OpenBLAS Project -All rights reserved. -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: -1. Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. -2. Redistributions in binary form must reproduce the above copyright -notice, this list of conditions and the following disclaimer in -the documentation and/or other materials provided with the -distribution. -3. Neither the name of the OpenBLAS project nor the names of -its contributors may be used to endorse or promote products -derived from this software without specific prior written permission. -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -*****************************************************************************/ - -#include "common.h" -#include - -#if !defined(DOUBLE) -#define VSETVL(n) vsetvl_e32m2(n) -#define VSETVL_MAX vsetvlmax_e32m2() -#define VSETVL_MAX_M1 vsetvlmax_e32m1() -#define FLOAT_V_T vfloat32m2_t -#define FLOAT_V_T_M1 vfloat32m1_t -#define VLEV_FLOAT vle32_v_f32m2 -#define VLSEG4_FLOAT vlseg4e32_v_f32m2 -#define VLSEG2_FLOAT vlseg2e32_v_f32m2 -#define VFMVVF_FLOAT vfmv_v_f_f32m2 -#define VFMUL_FLOAT vfmul_vv_f32m2 -#define VFMACCVF_FLOAT vfmacc_vf_f32m2 -#define VFMACCVV_FLOAT vfmacc_vv_f32m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f32m2_f32m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32 -#else -#define VSETVL(n) vsetvl_e64m2(n) -#define VSETVL_MAX vsetvlmax_e64m2() -#define VSETVL_MAX_M1 vsetvlmax_e64m1() -#define FLOAT_V_T vfloat64m2_t -#define FLOAT_V_T_M1 vfloat64m1_t -#define VLEV_FLOAT vle64_v_f64m2 -#define VLSEG4_FLOAT vlseg4e64_v_f64m2 -#define VLSEG2_FLOAT vlseg2e64_v_f64m2 -#define VFMVVF_FLOAT vfmv_v_f_f64m2 -#define VFMUL_FLOAT vfmul_vv_f64m2 -#define VFMACCVF_FLOAT vfmacc_vf_f64m2 -#define VFMACCVV_FLOAT vfmacc_vv_f64m2 -#define VFREDSUMVS_FLOAT vfredusum_vs_f64m2_f64m1 -#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1 -#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64 -#endif - - -// Optimizes the implementation in ../generic/trmmkernel_4x4.c - -int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset) -{ - - BLASLONG i,j,k; - FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb; - - FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3; - FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0; - v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1); - size_t vl; - size_t vlmax = VSETVL_MAX; - - FLOAT_V_T vres0_0; - FLOAT_V_T vres0_1; - FLOAT_V_T vres0_2; - FLOAT_V_T vres0_3; - - FLOAT_V_T vres1_0; - FLOAT_V_T vres1_1; - FLOAT_V_T vres1_2; - FLOAT_V_T vres1_3; - - FLOAT_V_T vres2_0; - FLOAT_V_T vres2_1; - FLOAT_V_T vres2_2; - FLOAT_V_T vres2_3; - - FLOAT_V_T vres3_0; - FLOAT_V_T vres3_1; - FLOAT_V_T vres3_2; - FLOAT_V_T vres3_3; - - BLASLONG off, temp; - - bool left; - bool transposed; - bool backwards; - -#ifdef LEFT - left = true; -#else - left = false; -#endif - -#ifdef TRANSA - transposed = true; -#else - transposed = false; -#endif - - backwards = left != transposed; - - if (!left) { - off = -offset; - } - - - for (j=0; j 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl); - vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl); - vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl); - - ptrba += vl * 4; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - if (!backwards) { - temp = bk-off; - temp = left ? temp - 4 : // number of values in A - temp - 4; // number of values in B - - ptrba += temp*4; // number of values in A - ptrbb += temp*4; // number of values in B - } -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - C2 = C2+4; - C3 = C3+4; - - } - - if ( bm & 2 ) // do any 2x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres2_1 = VFMVVF_FLOAT(0, vlmax); - - vres3_0 = VFMVVF_FLOAT(0, vlmax); - vres3_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+4; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl); - vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl); - - ptrba += vl * 2; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax); - - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - C2 = C2+2; - C3 = C3+2; - - } - - if ( bm & 1 ) // do any 1x4 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*4; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres2_0 = VFMVVF_FLOAT(0, vlmax); - vres3_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+4; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl); - vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl); - - ptrba += vl; - ptrbb += vl * 4; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 4; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*4; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - C2 = C2+1; - C3 = C3+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 4; -#endif - - k = (bk<<2); - bb = bb+k; - i = (ldc<<2); - C = C+i; - } - - for (j=0; j<(bn&2); j+=2) // do the Mx2 loops - { - C0 = C; - C1 = C0+ldc; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl); - - ptrba += vl * 4; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - C1 = C1+4; - - } - - if ( bm & 2 ) // do any 2x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*2; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - - vres1_0 = VFMVVF_FLOAT(0, vlmax); - vres1_1 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+2; // number of values in B -#endif - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl); - - ptrba += vl * 2; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax); - - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2); - C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - C1 = C1+2; - - } - - if ( bm & 1 ) // do any 1x2 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*2; -#endif - - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres1_0 = VFMVVF_FLOAT(0, vlmax); - - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+2; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl); - - ptrba += vl; - ptrbb += vl * 2; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 2; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*2; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - C1 = C1+1; - - } - - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 2; -#endif - - k = (bk<<1); - bb = bb+k; - i = (ldc<<1); - C = C+i; - } - - for (j=0; j<(bn&1); j+=1) // do the Mx1 loops - { - C0 = C; - -#if defined(TRMMKERNEL) && defined(LEFT) - off = offset; -#endif - - ptrba = ba; - - for (i=0; i 0; k -= vl) - { - vl = VSETVL(k); - VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl); - - vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl); - - ptrba += vl * 4; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax); - vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2); - C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 4; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*4; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 4; // number of values in A -#endif - - C0 = C0+4; - - } - - if ( bm & 2 ) // do any 2x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*2; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - vres0_1 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+2; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - VLSEG2_FLOAT(&va0, &va1, ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl); - - ptrba += vl * 2; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 2; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*2; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 2; // number of values in A -#endif - - C0 = C0+2; - - } - - if ( bm & 1 ) // do any 1x1 loop - { - -#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - ptrbb = bb; -#else - ptrba += off*1; - ptrbb = bb + off*1; -#endif - - vres0_0 = VFMVVF_FLOAT(0, vlmax); - -#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA)) - temp = bk-off; -#elif defined(LEFT) - temp = off+1; // number of values in A -#else - temp = off+1; // number of values in B -#endif - - for (k = temp; k > 0; k -= vl) - { - vl = VSETVL(k); - va0 = VLEV_FLOAT(ptrba, vl); - vb0 = VLEV_FLOAT(ptrbb, vl); - - vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl); - - ptrba += vl; - ptrbb += vl; - } - - vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax); - C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0); - - -#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA)) - temp = bk - off; -#ifdef LEFT - temp -= 1; // number of values in A -#else - temp -= 1; // number of values in B -#endif - ptrba += temp*1; - ptrbb += temp*1; -#endif - -#ifdef LEFT - off += 1; // number of values in A -#endif - - C0 = C0+1; - - } - -#if defined(TRMMKERNEL) && !defined(LEFT) - off += 1; -#endif - - k = (bk<<0); - bb = bb+k; - C = C+ldc; - } - return 0; -}