Remove redundant files
This commit is contained in:
parent
bef47917bd
commit
5d0d1c5551
|
@ -8,7 +8,6 @@ PREFIX ?= /opt/OpenBLAS
|
|||
OPENBLAS_INCLUDE_DIR := $(PREFIX)/include
|
||||
OPENBLAS_LIBRARY_DIR := $(PREFIX)/lib
|
||||
OPENBLAS_BINARY_DIR := $(PREFIX)/bin
|
||||
OPENBLAS_RELEASE_DIR := $(PREFIX)/release
|
||||
OPENBLAS_BUILD_DIR := $(CURDIR)
|
||||
OPENBLAS_CMAKE_DIR := $(OPENBLAS_LIBRARY_DIR)/cmake/$(LIBSONAMEBASE)
|
||||
OPENBLAS_CMAKE_CONFIG := OpenBLASConfig.cmake
|
||||
|
@ -39,7 +38,6 @@ install : lib.grd
|
|||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_INCLUDE_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_LIBRARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_BINARY_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_RELEASE_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)"
|
||||
@-mkdir -p "$(DESTDIR)$(OPENBLAS_PKGCONFIG_DIR)"
|
||||
@echo Generating openblas_config.h in $(DESTDIR)$(OPENBLAS_INCLUDE_DIR)
|
||||
|
@ -204,8 +202,5 @@ endif
|
|||
@echo " endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo "endif ()" >> "$(DESTDIR)$(OPENBLAS_CMAKE_DIR)/$(OPENBLAS_CMAKE_CONFIG_VERSION)"
|
||||
@echo Install OK!
|
||||
#Generating release tar
|
||||
@echo Generating $(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz
|
||||
@tar -cvz --file=$(OPENBLAS_RELEASE_DIR)/$(basename $(LIBNAME)).tar.gz --directory=$(PREFIX) --exclude=release .
|
||||
|
||||
|
||||
|
|
|
@ -122,23 +122,7 @@ CTRMMKERNEL = ztrmmkernel_2x2_rvv.c
|
|||
ZTRMMKERNEL = ztrmmkernel_2x2_rvv.c
|
||||
|
||||
# SGEMM_UNROLL_N set in params.h
|
||||
ifeq ($(SGEMM_UNROLL_N), 2)
|
||||
SGEMMKERNEL = gemmkernel_2x2_rvv.c
|
||||
SGEMMONCOPY = gemm_ncopy_2_rvv.c
|
||||
SGEMMOTCOPY = gemm_tcopy_2_rvv.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
STRMMKERNEL = trmmkernel_2x2_rvv.c
|
||||
else ifeq ($(SGEMM_UNROLL_N), 4)
|
||||
SGEMMKERNEL = gemmkernel_4x4_rvv.c
|
||||
SGEMMONCOPY = gemm_ncopy_4_rvv.c
|
||||
SGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
SGEMMONCOPYOBJ = sgemm_oncopy.o
|
||||
SGEMMOTCOPYOBJ = sgemm_otcopy.o
|
||||
|
||||
STRMMKERNEL = trmmkernel_4x4_rvv.c
|
||||
else ifeq ($(SGEMM_UNROLL_N), 8)
|
||||
ifeq ($(SGEMM_UNROLL_N), 8)
|
||||
# UNROLL_M is VLMAX
|
||||
SGEMMKERNEL = gemmkernel_rvv_v1x8.c
|
||||
SGEMMINCOPY = gemm_ncopy_rvv_v1.c
|
||||
|
@ -162,23 +146,7 @@ SSYMMLCOPY_M = symm_lcopy_rvv_v1.c
|
|||
endif
|
||||
|
||||
# DGEMM_UNROLL_N set in params.h
|
||||
ifeq ($(DGEMM_UNROLL_N), 2)
|
||||
DGEMMKERNEL = gemmkernel_2x2_rvv.c
|
||||
DGEMMONCOPY = gemm_ncopy_2_rvv.c
|
||||
DGEMMOTCOPY = gemm_tcopy_2_rvv.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
DTRMMKERNEL = trmmkernel_2x2_rvv.c
|
||||
else ifeq ($(DGEMM_UNROLL_N), 4)
|
||||
DGEMMKERNEL = gemmkernel_4x4_rvv.c
|
||||
DGEMMONCOPY = gemm_ncopy_4_rvv.c
|
||||
DGEMMOTCOPY = ../generic/gemm_tcopy_4.c
|
||||
DGEMMONCOPYOBJ = dgemm_oncopy.o
|
||||
DGEMMOTCOPYOBJ = dgemm_otcopy.o
|
||||
|
||||
DTRMMKERNEL = trmmkernel_4x4_rvv.c
|
||||
else ifeq ($(DGEMM_UNROLL_N), 8)
|
||||
ifeq ($(DGEMM_UNROLL_N), 8)
|
||||
# UNROLL_M is VLMAX
|
||||
DGEMMKERNEL = gemmkernel_rvv_v1x8.c
|
||||
DGEMMINCOPY = gemm_ncopy_rvv_v1.c
|
||||
|
|
|
@ -1,92 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VSEV_FLOAT vse32_v_f32m4
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m4
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VSEV_FLOAT vse64_v_f64m4
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m4
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_ncopy_2.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
IFLOAT *b_offset;
|
||||
FLOAT_V_T v1, v2;
|
||||
size_t vl;
|
||||
|
||||
//fprintf(stderr, "gemm_ncopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for(j = (n >> 1); j > 0; j--) {
|
||||
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
VSSEG2_FLOAT(b_offset, v1, v2, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
b_offset += vl*2;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset, vl);
|
||||
VSEV_FLOAT(b_offset, v1, vl);
|
||||
|
||||
a_offset += vl;
|
||||
b_offset += vl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,123 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLEV_FLOAT vle32_v_f32m2
|
||||
#define VSEV_FLOAT vse32_v_f32m2
|
||||
#define VSSEG2_FLOAT vsseg2e32_v_f32m2
|
||||
#define VSSEG4_FLOAT vsseg4e32_v_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLEV_FLOAT vle64_v_f64m2
|
||||
#define VSEV_FLOAT vse64_v_f64m2
|
||||
#define VSSEG2_FLOAT vsseg2e64_v_f64m2
|
||||
#define VSSEG4_FLOAT vsseg4e64_v_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_ncopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *b_offset;
|
||||
|
||||
FLOAT_V_T v1, v2, v3, v4;
|
||||
size_t vl;
|
||||
|
||||
//fprintf(stderr, "gemm_ncopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
for(j = (n >> 2); j > 0; j--) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
v3 = VLEV_FLOAT(a_offset3, vl);
|
||||
v4 = VLEV_FLOAT(a_offset4, vl);
|
||||
|
||||
VSSEG4_FLOAT(b_offset, v1, v2, v3, v4, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
a_offset3 += vl;
|
||||
a_offset4 += vl;
|
||||
b_offset += vl*4;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
v2 = VLEV_FLOAT(a_offset2, vl);
|
||||
|
||||
VSSEG2_FLOAT(b_offset, v1, v2, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
a_offset2 += vl;
|
||||
b_offset += vl*2;
|
||||
}
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
a_offset1 = a_offset;
|
||||
|
||||
for(i = m; i > 0; i -= vl) {
|
||||
vl = VSETVL(i);
|
||||
|
||||
v1 = VLEV_FLOAT(a_offset1, vl);
|
||||
|
||||
VSEV_FLOAT(b_offset, v1, vl);
|
||||
|
||||
a_offset1 += vl;
|
||||
b_offset += vl;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,108 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
|
||||
#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
|
||||
#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_tcopy_2.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, IFLOAT *a, BLASLONG lda, IFLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
IFLOAT *a_offset, *a_offset1, *a_offset2;
|
||||
IFLOAT *b_offset, *b_offset1, *b_offset2;
|
||||
FLOAT_V_T v1a, v1b, v2a, v2b;
|
||||
size_t vl;
|
||||
|
||||
//fprintf(stderr, "gemm_tcopy_2 m=%ld n=%ld lda=%ld\n", m, n, lda); // KU
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
b_offset2 = b + m * (n & ~1);
|
||||
|
||||
for(i = (m >> 1); i > 0; i--) {
|
||||
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset += 4;
|
||||
|
||||
for(j = (n >> 1); j > 0; j -= vl) {
|
||||
vl = VSETVL(j);
|
||||
|
||||
VLSEG2_FLOAT(&v1a, &v1b, a_offset1, vl);
|
||||
VLSEG2_FLOAT(&v2a, &v2b, a_offset2, vl);
|
||||
|
||||
VSSSEG4_FLOAT(b_offset1, m*2*sizeof(FLOAT), v1a, v1b, v2a, v2b, vl);
|
||||
|
||||
a_offset1 += vl * 2;
|
||||
a_offset2 += vl * 2;
|
||||
b_offset1 += vl * m * 2;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
*(b_offset2 + 0) = *(a_offset1 + 0);
|
||||
*(b_offset2 + 1) = *(a_offset2 + 0);
|
||||
b_offset2 += 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (m & 1) {
|
||||
|
||||
for(j = (n >> 1); j > 0; j -= vl) {
|
||||
vl = VSETVL(j);
|
||||
|
||||
VLSEG2_FLOAT(&v1a, &v1b, a_offset, vl);
|
||||
|
||||
VSSSEG2_FLOAT(b_offset, m*2*sizeof(FLOAT), v1a, v1b, vl);
|
||||
|
||||
a_offset += vl * 2;
|
||||
b_offset += vl * m * 2;
|
||||
}
|
||||
|
||||
if (n & 1){
|
||||
*(b_offset2 + 0) = *(a_offset + 0);
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,236 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m2(n)
|
||||
#define FLOAT_V_T vfloat32m2_t
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m2
|
||||
#define VSSSEG2_FLOAT vssseg2e32_v_f32m2
|
||||
#define VSSSEG4_FLOAT vssseg4e32_v_f32m2
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m2(n)
|
||||
#define FLOAT_V_T vfloat64m2_t
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m2
|
||||
#define VSSSEG2_FLOAT vssseg2e64_v_f64m2
|
||||
#define VSSSEG4_FLOAT vssseg4e64_v_f64m2
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_tcopy_4.c
|
||||
|
||||
int CNAME(BLASLONG m, BLASLONG n, FLOAT *a, BLASLONG lda, FLOAT *b)
|
||||
{
|
||||
BLASLONG i, j;
|
||||
|
||||
FLOAT *a_offset, *a_offset1, *a_offset2, *a_offset3, *a_offset4;
|
||||
FLOAT *b_offset, *b_offset1, *b_offset2, *b_offset3;
|
||||
FLOAT ctemp1, ctemp2, ctemp3, ctemp4;
|
||||
FLOAT ctemp5, ctemp6, ctemp7, ctemp8;
|
||||
FLOAT ctemp9, ctemp10, ctemp11, ctemp12;
|
||||
FLOAT ctemp13, ctemp14, ctemp15, ctemp16;
|
||||
|
||||
//fprintf(stderr, "gemm_tcopy_4 m=%ld n=%ld lda=%ld\n", m, n, lda);
|
||||
|
||||
a_offset = a;
|
||||
b_offset = b;
|
||||
|
||||
b_offset2 = b + m * (n & ~3);
|
||||
b_offset3 = b + m * (n & ~1);
|
||||
|
||||
for(j = (m >> 2); j > 0; j--) {
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset3 = a_offset2 + lda;
|
||||
a_offset4 = a_offset3 + lda;
|
||||
a_offset += 4 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset += 16;
|
||||
|
||||
for(i = (n >> 2); i > 0; i--) {
|
||||
v1 = VLEV_FLOAT(a_offset1, 4);
|
||||
v2 = VLEV_FLOAT(a_offset2, 4);
|
||||
v3 = VLEV_FLOAT(a_offset3, 4);
|
||||
v4 = VLEV_FLOAT(a_offset4, 4);
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
a_offset3 += 4;
|
||||
a_offset4 += 4;
|
||||
|
||||
VSEV_FLOAT(b_offset1, v1, 4);
|
||||
VSEV_FLOAT(b_offset2+4, v2, 4);
|
||||
VSEV_FLOAT(b_offset2+8, v3, 4);
|
||||
VSEV_FLOAT(b_offset2+12, v4, 4);
|
||||
|
||||
b_offset1 += m * 4;
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
v1 = VLEV_FLOAT(a_offset1, 2);
|
||||
v2 = VLEV_FLOAT(a_offset2, 2);
|
||||
v3 = VLEV_FLOAT(a_offset3, 2);
|
||||
v4 = VLEV_FLOAT(a_offset4, 2);
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
a_offset3 += 2;
|
||||
a_offset4 += 2;
|
||||
|
||||
VSEV_FLOAT(b_offset2, v1, 2);
|
||||
VSEV_FLOAT(b_offset2+2, v2, 2);
|
||||
VSEV_FLOAT(b_offset2+4, v3, 2);
|
||||
VSEV_FLOAT(b_offset2+6, v4, 2);
|
||||
|
||||
b_offset2 += 8;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
v1 = VLEV_FLOAT(a_offset1, 1);
|
||||
v2 = VLEV_FLOAT(a_offset2, 1);
|
||||
v3 = VLEV_FLOAT(a_offset3, 1);
|
||||
v4 = VLEV_FLOAT(a_offset4, 1);
|
||||
|
||||
VSSEG4_FLOAT(b_offset3, v1, v2, v3, v4, 1);
|
||||
|
||||
b_offset3 += 4;
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
// TODO cleanup
|
||||
|
||||
if (m & 2){
|
||||
a_offset1 = a_offset;
|
||||
a_offset2 = a_offset1 + lda;
|
||||
a_offset += 2 * lda;
|
||||
|
||||
b_offset1 = b_offset;
|
||||
b_offset += 8;
|
||||
|
||||
i = (n >> 2);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
ctemp2 = *(a_offset1 + 1);
|
||||
ctemp3 = *(a_offset1 + 2);
|
||||
ctemp4 = *(a_offset1 + 3);
|
||||
|
||||
ctemp5 = *(a_offset2 + 0);
|
||||
ctemp6 = *(a_offset2 + 1);
|
||||
ctemp7 = *(a_offset2 + 2);
|
||||
ctemp8 = *(a_offset2 + 3);
|
||||
|
||||
a_offset1 += 4;
|
||||
a_offset2 += 4;
|
||||
|
||||
*(b_offset1 + 0) = ctemp1;
|
||||
*(b_offset1 + 1) = ctemp2;
|
||||
*(b_offset1 + 2) = ctemp3;
|
||||
*(b_offset1 + 3) = ctemp4;
|
||||
|
||||
*(b_offset1 + 4) = ctemp5;
|
||||
*(b_offset1 + 5) = ctemp6;
|
||||
*(b_offset1 + 6) = ctemp7;
|
||||
*(b_offset1 + 7) = ctemp8;
|
||||
|
||||
b_offset1 += m * 4;
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
ctemp2 = *(a_offset1 + 1);
|
||||
|
||||
ctemp3 = *(a_offset2 + 0);
|
||||
ctemp4 = *(a_offset2 + 1);
|
||||
|
||||
a_offset1 += 2;
|
||||
a_offset2 += 2;
|
||||
|
||||
*(b_offset2 + 0) = ctemp1;
|
||||
*(b_offset2 + 1) = ctemp2;
|
||||
*(b_offset2 + 2) = ctemp3;
|
||||
*(b_offset2 + 3) = ctemp4;
|
||||
|
||||
b_offset2 += 4;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
ctemp2 = *(a_offset2 + 0);
|
||||
|
||||
*(b_offset3 + 0) = ctemp1;
|
||||
*(b_offset3 + 1) = ctemp2;
|
||||
b_offset3 += 2;
|
||||
}
|
||||
}
|
||||
|
||||
if (m & 1){
|
||||
a_offset1 = a_offset;
|
||||
b_offset1 = b_offset;
|
||||
|
||||
i = (n >> 2);
|
||||
if (i > 0){
|
||||
do{
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
ctemp2 = *(a_offset1 + 1);
|
||||
ctemp3 = *(a_offset1 + 2);
|
||||
ctemp4 = *(a_offset1 + 3);
|
||||
|
||||
a_offset1 += 4;
|
||||
|
||||
*(b_offset1 + 0) = ctemp1;
|
||||
*(b_offset1 + 1) = ctemp2;
|
||||
*(b_offset1 + 2) = ctemp3;
|
||||
*(b_offset1 + 3) = ctemp4;
|
||||
|
||||
b_offset1 += 4 * m;
|
||||
|
||||
i --;
|
||||
}while(i > 0);
|
||||
}
|
||||
|
||||
if (n & 2) {
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
ctemp2 = *(a_offset1 + 1);
|
||||
a_offset1 += 2;
|
||||
|
||||
*(b_offset2 + 0) = ctemp1;
|
||||
*(b_offset2 + 1) = ctemp2;
|
||||
}
|
||||
|
||||
if (n & 1) {
|
||||
ctemp1 = *(a_offset1 + 0);
|
||||
*(b_offset3 + 0) = ctemp1;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,214 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
#if !defined(DOUBLE)
|
||||
#define VSETVL(n) vsetvl_e32m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e32m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e32m1()
|
||||
#define FLOAT_V_T vfloat32m4_t
|
||||
#define FLOAT_V_T_M1 vfloat32m1_t
|
||||
#define VLEV_FLOAT vle32_v_f32m4
|
||||
#define VLSEG2_FLOAT vlseg2e32_v_f32m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f32m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f32m4
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f32m4
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f32m4_f32m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f32m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f32m1_f32
|
||||
#else
|
||||
#define VSETVL(n) vsetvl_e64m4(n)
|
||||
#define VSETVL_MAX vsetvlmax_e64m4()
|
||||
#define VSETVL_MAX_M1 vsetvlmax_e64m1()
|
||||
#define FLOAT_V_T vfloat64m4_t
|
||||
#define FLOAT_V_T_M1 vfloat64m1_t
|
||||
#define VLEV_FLOAT vle64_v_f64m4
|
||||
#define VLSEG2_FLOAT vlseg2e64_v_f64m4
|
||||
#define VFMVVF_FLOAT vfmv_v_f_f64m4
|
||||
#define VFMACCVF_FLOAT vfmacc_vf_f64m4
|
||||
#define VFMACCVV_FLOAT vfmacc_vv_f64m4
|
||||
#define VFREDSUMVS_FLOAT vfredusum_vs_f64m4_f64m1
|
||||
#define VFMVVF_FLOAT_M1 vfmv_v_f_f64m1
|
||||
#define VFMVFS_FLOAT_M1 vfmv_f_s_f64m1_f64
|
||||
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
|
||||
|
||||
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
,BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1;
|
||||
IFLOAT *ptrba,*ptrbb;
|
||||
|
||||
//fprintf(stderr, "gemm_kernel_2x2 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc);
|
||||
|
||||
FLOAT_V_T va0, va1, vb0, vb1;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3;
|
||||
FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
|
||||
FLOAT_V_T_M1 v_z0;
|
||||
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
size_t vl;
|
||||
|
||||
for (j = bn/2; j > 0; j--) {
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/2; i > 0; i--) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
|
||||
|
||||
ptrba += vl*2;
|
||||
ptrbb += vl*2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 2;
|
||||
C1 += 2;
|
||||
}
|
||||
|
||||
if(bm & 1) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl*2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
C0 += 1;
|
||||
C1 += 1;
|
||||
}
|
||||
|
||||
bb += (bk<<1);
|
||||
C += (ldc<<1);
|
||||
}
|
||||
|
||||
if(bn & 1) {
|
||||
C0 = C;
|
||||
ptrba = ba;
|
||||
for (i = bm/2; i > 0; i--) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
|
||||
ptrba += vl*2;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
C0 += 2;
|
||||
}
|
||||
|
||||
if(bm & 1) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
|
||||
C0 += 1;
|
||||
}
|
||||
|
||||
bb += (bk<<0);
|
||||
C += ldc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,508 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the kernel
 * below.  When DOUBLE is not defined the e32/f32 intrinsics with LMUL=1
 * ("m1") are selected; when DOUBLE is defined the e64/f64 ones are used.
 * The *_M1 aliases name the LMUL=1 types/intrinsics used for reductions.
 */
#if !defined(DOUBLE)
#define VSETVL(n)         vsetvl_e32m1(n)
#define VSETVL_MAX        vsetvlmax_e32m1()
#define VSETVL_MAX_M1     vsetvlmax_e32m1()
#define FLOAT_V_T         vfloat32m1_t
#define FLOAT_V_T_M1      vfloat32m1_t
#define VLEV_FLOAT        vle32_v_f32m1
#define VLSEG2_FLOAT      vlseg2e32_v_f32m1
#define VLSEG4_FLOAT      vlseg4e32_v_f32m1
#define VFMVVF_FLOAT      vfmv_v_f_f32m1
#define VFMACCVF_FLOAT    vfmacc_vf_f32m1
#define VFMACCVV_FLOAT    vfmacc_vv_f32m1
#define VFREDSUMVS_FLOAT  vfredusum_vs_f32m1_f32m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f32m1_f32
#else
#define VSETVL(n)         vsetvl_e64m1(n)
#define VSETVL_MAX        vsetvlmax_e64m1()
#define VSETVL_MAX_M1     vsetvlmax_e64m1()
#define FLOAT_V_T         vfloat64m1_t
#define FLOAT_V_T_M1      vfloat64m1_t
#define VLEV_FLOAT        vle64_v_f64m1
#define VLSEG2_FLOAT      vlseg2e64_v_f64m1
#define VLSEG4_FLOAT      vlseg4e64_v_f64m1
#define VFMVVF_FLOAT      vfmv_v_f_f64m1
#define VFMACCVF_FLOAT    vfmacc_vf_f64m1
#define VFMACCVV_FLOAT    vfmacc_vv_f64m1
#define VFREDSUMVS_FLOAT  vfredusum_vs_f64m1_f64m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f64m1_f64
#endif
|
||||
|
||||
// Optimizes the implementation in ../generic/gemm_kernel_2x2.c
|
||||
|
||||
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alpha, IFLOAT* ba, IFLOAT* bb, FLOAT* C, BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
,BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*C2,*C3;
|
||||
IFLOAT *ptrba,*ptrbb;
|
||||
|
||||
//fprintf(stderr, "gemm_kernel_4x4 bm=%ld bn=%ld bk=%ld alpha=%f ldc=%ld\n", bm, bn, bk, alpha, ldc); // KU
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3;
|
||||
FLOAT_V_T vb0, vb1, vb2, vb3;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
||||
FLOAT_V_T vres8, vres9, vres10, vres11, vres12, vres13, vres14, vres15;
|
||||
FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3;
|
||||
FLOAT_V_T_M1 v_z0;
|
||||
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
size_t vl;
|
||||
|
||||
for (j = bn/4; j > 0; j--) {
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
C2 = C1 + ldc;
|
||||
C3 = C2 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/4; i > 0; i--) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres8 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres9 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres10 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres11 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres12 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres13 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres14 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres15 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
|
||||
|
||||
vres8 = VFMACCVV_FLOAT(vres8, va2, vb0, vl);
|
||||
vres9 = VFMACCVV_FLOAT(vres9, va3, vb0, vl);
|
||||
vres10 = VFMACCVV_FLOAT(vres10, va2, vb1, vl);
|
||||
vres11 = VFMACCVV_FLOAT(vres11, va3, vb1, vl);
|
||||
|
||||
vres12 = VFMACCVV_FLOAT(vres12, va2, vb2, vl);
|
||||
vres13 = VFMACCVV_FLOAT(vres13, va3, vb2, vl);
|
||||
vres14 = VFMACCVV_FLOAT(vres14, va2, vb3, vl);
|
||||
vres15 = VFMACCVV_FLOAT(vres15, va3, vb3, vl);
|
||||
|
||||
ptrba += vl*4;
|
||||
ptrbb += vl*4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres8, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres9, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres10, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres11, v_z0, vlmax);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres12, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres13, v_z0, vlmax);
|
||||
C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C2[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C2[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres14, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres15, v_z0, vlmax);
|
||||
C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C3[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C3[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 4;
|
||||
C1 += 4;
|
||||
C2 += 4;
|
||||
C3 += 4;
|
||||
}
|
||||
|
||||
if(bm & 2) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres4 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va0, vb3, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va1, vb3, vl);
|
||||
|
||||
ptrba += vl*2;
|
||||
ptrbb += vl*4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3, v_z0, vlmax);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
|
||||
C2[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C2[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres6, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres7, v_z0, vlmax);
|
||||
C3[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C3[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
C0 += 2;
|
||||
C1 += 2;
|
||||
C2 += 2;
|
||||
C3 += 2;
|
||||
}
|
||||
|
||||
if(bm & 1) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl*4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C2[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C3[0] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 1;
|
||||
C1 += 1;
|
||||
C2 += 1;
|
||||
C3 += 1;
|
||||
}
|
||||
|
||||
bb += (bk<<2);
|
||||
C += (ldc<<2);
|
||||
}
|
||||
|
||||
if(bn & 2) {
|
||||
|
||||
C0 = C;
|
||||
C1 = C0 + ldc;
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/4; i > 0; i--) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
vres4 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres5 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres6 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres7 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
|
||||
vres4 = VFMACCVV_FLOAT(vres4, va0, vb1, vl);
|
||||
vres5 = VFMACCVV_FLOAT(vres5, va1, vb1, vl);
|
||||
vres6 = VFMACCVV_FLOAT(vres6, va2, vb1, vl);
|
||||
vres7 = VFMACCVV_FLOAT(vres7, va3, vb1, vl);
|
||||
|
||||
ptrba += vl*4;
|
||||
ptrbb += vl*2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres4, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres5, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres6, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres7, v_z0, vlmax);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 4;
|
||||
C1 += 4;
|
||||
}
|
||||
|
||||
if(bm & 2) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
|
||||
|
||||
ptrba += vl*2;
|
||||
ptrbb += vl*2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[1] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 2;
|
||||
C1 += 2;
|
||||
}
|
||||
|
||||
if(bm & 1) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl*2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[0] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
C0 += 1;
|
||||
C1 += 1;
|
||||
}
|
||||
|
||||
bb += (bk<<1);
|
||||
C += (ldc<<1);
|
||||
}
|
||||
|
||||
if(bn & 1) {
|
||||
C0 = C;
|
||||
ptrba = ba;
|
||||
for (i = bm/4; i > 0; i--) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
||||
|
||||
ptrba += vl*4;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] += alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] += alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
C0 += 4;
|
||||
}
|
||||
|
||||
if(bm & 2) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
|
||||
ptrba += vl*2;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] += alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
C0 += 2;
|
||||
}
|
||||
|
||||
if(bm & 1) {
|
||||
ptrbb = bb;
|
||||
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = bk; k > 0; k -= vl) {
|
||||
vl = VSETVL(k);
|
||||
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0, v_z0, vlmax);
|
||||
C0[0] += alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
|
||||
C0 += 1;
|
||||
}
|
||||
|
||||
bb += (bk<<0);
|
||||
C += ldc;
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
|
@ -1,342 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the kernel
 * below.  When DOUBLE is not defined the e32/f32 intrinsics with LMUL=4
 * ("m4") are selected; when DOUBLE is defined the e64/f64 ones are used.
 * The *_M1 aliases name the LMUL=1 types/intrinsics used for reductions.
 */
#if !defined(DOUBLE)
#define VSETVL(n)         vsetvl_e32m4(n)
#define VSETVL_MAX        vsetvlmax_e32m4()
#define VSETVL_MAX_M1     vsetvlmax_e32m1()
#define FLOAT_V_T         vfloat32m4_t
#define FLOAT_V_T_M1      vfloat32m1_t
#define VLEV_FLOAT        vle32_v_f32m4
#define VLSEG_FLOAT       vlseg2e32_v_f32m4
#define VFMVVF_FLOAT      vfmv_v_f_f32m4
#define VFMACCVF_FLOAT    vfmacc_vf_f32m4
#define VFMACCVV_FLOAT    vfmacc_vv_f32m4
#define VFREDSUMVS_FLOAT  vfredusum_vs_f32m4_f32m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f32m1_f32
#else
#define VSETVL(n)         vsetvl_e64m4(n)
#define VSETVL_MAX        vsetvlmax_e64m4()
#define VSETVL_MAX_M1     vsetvlmax_e64m1()
#define FLOAT_V_T         vfloat64m4_t
#define FLOAT_V_T_M1      vfloat64m1_t
#define VLEV_FLOAT        vle64_v_f64m4
#define VLSEG_FLOAT       vlseg2e64_v_f64m4
#define VFMVVF_FLOAT      vfmv_v_f_f64m4
#define VFMACCVF_FLOAT    vfmacc_vf_f64m4
#define VFMACCVV_FLOAT    vfmacc_vv_f64m4
#define VFREDSUMVS_FLOAT  vfredusum_vs_f64m4_f64m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f64m1_f64
#endif
|
||||
|
||||
|
||||
// Optimizes the implementation in ../generic/trmmkernel_2x2.c
|
||||
|
||||
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc
|
||||
#ifdef TRMMKERNEL
|
||||
,BLASLONG offset
|
||||
#endif
|
||||
)
|
||||
{
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
||||
BLASLONG off, temp;
|
||||
|
||||
FLOAT_V_T va0, va1, vb0, vb1;
|
||||
FLOAT_V_T vres0, vres1, vres2, vres3;
|
||||
FLOAT_V_T_M1 v_res, v_z0;
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vl;
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off = -offset;
|
||||
#else
|
||||
off = 0;
|
||||
#endif
|
||||
|
||||
for (j = bn/2; j > 0; j--)
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/2; i > 0; i--)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || \
|
||||
(!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+2;
|
||||
#else
|
||||
temp = off+2;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
||||
vres2 = VFMACCVV_FLOAT(vres2, va0, vb1, vl);
|
||||
vres3 = VFMACCVV_FLOAT(vres3, va1, vb1, vl);
|
||||
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl * 2;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres2, v_z0, vlmax);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres3, v_z0, vlmax);
|
||||
C1[1] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
}
|
||||
|
||||
if (bm & 1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off;
|
||||
ptrbb = bb+off*2;
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1;
|
||||
#else
|
||||
temp = off+2;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl * 2;
|
||||
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 2;
|
||||
#endif
|
||||
ptrba += temp;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
|
||||
if (bn & 1)
|
||||
{
|
||||
C0 = C;
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
ptrba = ba;
|
||||
|
||||
for (i = bm/2; i > 0; i--)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off;
|
||||
#endif
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || \
|
||||
(!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+2;
|
||||
#else
|
||||
temp = off+1;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
VLSEG_FLOAT(&va0, &va1, ptrba, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
|
||||
vres1 = VFMACCVV_FLOAT(vres1, vb0, va1, vl);
|
||||
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl;
|
||||
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres1, v_z0, vlmax);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || \
|
||||
(!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
C0 = C0+2;
|
||||
}
|
||||
|
||||
if (bm & 1)
|
||||
{
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off;
|
||||
ptrbb = bb+off;
|
||||
#endif
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off + 1;
|
||||
#else
|
||||
temp = off + 1;
|
||||
#endif
|
||||
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0 = VFMACCVV_FLOAT(vres0, vb0, va0, vl);
|
||||
ptrba += vl;
|
||||
ptrbb += vl;
|
||||
}
|
||||
v_res = VFREDSUMVS_FLOAT(v_res, vres0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(v_res);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#ifdef LEFT
|
||||
temp -= 1;
|
||||
#else
|
||||
temp -= 1;
|
||||
#endif
|
||||
ptrba += temp;
|
||||
ptrbb += temp;
|
||||
#endif
|
||||
#ifdef LEFT
|
||||
off += 1;
|
||||
#endif
|
||||
C0 = C0+1;
|
||||
}
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1;
|
||||
#endif
|
||||
k = (bk<<0);
|
||||
bb = bb+k;
|
||||
C = C+ldc;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
|
@ -1,881 +0,0 @@
|
|||
/***************************************************************************
|
||||
Copyright (c) 2022, The OpenBLAS Project
|
||||
All rights reserved.
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions are
|
||||
met:
|
||||
1. Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
2. Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in
|
||||
the documentation and/or other materials provided with the
|
||||
distribution.
|
||||
3. Neither the name of the OpenBLAS project nor the names of
|
||||
its contributors may be used to endorse or promote products
|
||||
derived from this software without specific prior written permission.
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
#include "common.h"
|
||||
#include <stdbool.h>
|
||||
|
||||
/*
 * Precision-dependent aliases for the RVV intrinsics used by the kernel
 * below.  When DOUBLE is not defined the e32/f32 intrinsics with LMUL=2
 * ("m2") are selected; when DOUBLE is defined the e64/f64 ones are used.
 * The *_M1 aliases name the LMUL=1 types/intrinsics used for reductions.
 */
#if !defined(DOUBLE)
#define VSETVL(n)         vsetvl_e32m2(n)
#define VSETVL_MAX        vsetvlmax_e32m2()
#define VSETVL_MAX_M1     vsetvlmax_e32m1()
#define FLOAT_V_T         vfloat32m2_t
#define FLOAT_V_T_M1      vfloat32m1_t
#define VLEV_FLOAT        vle32_v_f32m2
#define VLSEG4_FLOAT      vlseg4e32_v_f32m2
#define VLSEG2_FLOAT      vlseg2e32_v_f32m2
#define VFMVVF_FLOAT      vfmv_v_f_f32m2
#define VFMUL_FLOAT       vfmul_vv_f32m2
#define VFMACCVF_FLOAT    vfmacc_vf_f32m2
#define VFMACCVV_FLOAT    vfmacc_vv_f32m2
#define VFREDSUMVS_FLOAT  vfredusum_vs_f32m2_f32m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f32m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f32m1_f32
#else
#define VSETVL(n)         vsetvl_e64m2(n)
#define VSETVL_MAX        vsetvlmax_e64m2()
#define VSETVL_MAX_M1     vsetvlmax_e64m1()
#define FLOAT_V_T         vfloat64m2_t
#define FLOAT_V_T_M1      vfloat64m1_t
#define VLEV_FLOAT        vle64_v_f64m2
#define VLSEG4_FLOAT      vlseg4e64_v_f64m2
#define VLSEG2_FLOAT      vlseg2e64_v_f64m2
#define VFMVVF_FLOAT      vfmv_v_f_f64m2
#define VFMUL_FLOAT       vfmul_vv_f64m2
#define VFMACCVF_FLOAT    vfmacc_vf_f64m2
#define VFMACCVV_FLOAT    vfmacc_vv_f64m2
#define VFREDSUMVS_FLOAT  vfredusum_vs_f64m2_f64m1
#define VFMVVF_FLOAT_M1   vfmv_v_f_f64m1
#define VFMVFS_FLOAT_M1   vfmv_f_s_f64m1_f64
#endif
|
||||
|
||||
|
||||
// Optimizes the implementation in ../generic/trmmkernel_4x4.c
|
||||
|
||||
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alpha,FLOAT* ba,FLOAT* bb,FLOAT* C,BLASLONG ldc ,BLASLONG offset)
|
||||
{
|
||||
|
||||
BLASLONG i,j,k;
|
||||
FLOAT *C0,*C1,*C2,*C3,*ptrba,*ptrbb;
|
||||
|
||||
FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
|
||||
FLOAT_V_T_M1 vsum0, vsum1, vsum2, vsum3, v_z0;
|
||||
v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
||||
size_t vl;
|
||||
size_t vlmax = VSETVL_MAX;
|
||||
|
||||
FLOAT_V_T vres0_0;
|
||||
FLOAT_V_T vres0_1;
|
||||
FLOAT_V_T vres0_2;
|
||||
FLOAT_V_T vres0_3;
|
||||
|
||||
FLOAT_V_T vres1_0;
|
||||
FLOAT_V_T vres1_1;
|
||||
FLOAT_V_T vres1_2;
|
||||
FLOAT_V_T vres1_3;
|
||||
|
||||
FLOAT_V_T vres2_0;
|
||||
FLOAT_V_T vres2_1;
|
||||
FLOAT_V_T vres2_2;
|
||||
FLOAT_V_T vres2_3;
|
||||
|
||||
FLOAT_V_T vres3_0;
|
||||
FLOAT_V_T vres3_1;
|
||||
FLOAT_V_T vres3_2;
|
||||
FLOAT_V_T vres3_3;
|
||||
|
||||
BLASLONG off, temp;
|
||||
|
||||
bool left;
|
||||
bool transposed;
|
||||
bool backwards;
|
||||
|
||||
#ifdef LEFT
|
||||
left = true;
|
||||
#else
|
||||
left = false;
|
||||
#endif
|
||||
|
||||
#ifdef TRANSA
|
||||
transposed = true;
|
||||
#else
|
||||
transposed = false;
|
||||
#endif
|
||||
|
||||
backwards = left != transposed;
|
||||
|
||||
if (!left) {
|
||||
off = -offset;
|
||||
}
|
||||
|
||||
|
||||
for (j=0; j<bn/4; j+=1) // do blocks of the Mx4 loops
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
C2 = C1+ldc;
|
||||
C3 = C2+ldc;
|
||||
|
||||
|
||||
if (left) {
|
||||
off = offset;
|
||||
}
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i=0; i<bm/4; i+=1) // do blocks of 4x4
|
||||
{
|
||||
|
||||
ptrbb = bb;
|
||||
if (backwards)
|
||||
{
|
||||
ptrba += off*4; // number of values in A
|
||||
ptrbb += off*4; // number of values in B
|
||||
}
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres2_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres2_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres2_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres2_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres3_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres3_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres3_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres3_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
temp = backwards ? bk-off :
|
||||
left ? off + 4 : // number of values in A
|
||||
off + 4; // number of values in B
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
|
||||
vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
|
||||
vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
|
||||
vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
|
||||
|
||||
vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
|
||||
vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
|
||||
vres2_2 = VFMACCVV_FLOAT(vres2_2, va2, vb2, vl);
|
||||
vres3_2 = VFMACCVV_FLOAT(vres3_2, va2, vb3, vl);
|
||||
|
||||
vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
|
||||
vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
|
||||
vres2_3 = VFMACCVV_FLOAT(vres2_3, va3, vb2, vl);
|
||||
vres3_3 = VFMACCVV_FLOAT(vres3_3, va3, vb3, vl);
|
||||
|
||||
ptrba += vl * 4;
|
||||
ptrbb += vl * 4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres2_3, v_z0, vlmax);
|
||||
C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C2[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C2[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres3_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres3_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_3, v_z0, vlmax);
|
||||
C3[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C3[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C3[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C3[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
if (!backwards) {
|
||||
temp = bk-off;
|
||||
temp = left ? temp - 4 : // number of values in A
|
||||
temp - 4; // number of values in B
|
||||
|
||||
ptrba += temp*4; // number of values in A
|
||||
ptrbb += temp*4; // number of values in B
|
||||
}
|
||||
#ifdef LEFT
|
||||
off += 4; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
C2 = C2+4;
|
||||
C3 = C3+4;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 2 ) // do any 2x4 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*4;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres2_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres2_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres3_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres3_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+2; // number of values in A
|
||||
#else
|
||||
temp = off+4; // number of values in B
|
||||
#endif
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
|
||||
vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
|
||||
vres2_1 = VFMACCVV_FLOAT(vres2_1, va1, vb2, vl);
|
||||
vres3_1 = VFMACCVV_FLOAT(vres3_1, va1, vb3, vl);
|
||||
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl * 4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
|
||||
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres2_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres2_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres3_0, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_1, v_z0, vlmax);
|
||||
|
||||
C2[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C2[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C3[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C3[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2; // number of values in A
|
||||
#else
|
||||
temp -= 4; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*4;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 2; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
C2 = C2+2;
|
||||
C3 = C3+2;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 1 ) // do any 1x4 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*1;
|
||||
ptrbb = bb + off*4;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres2_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres3_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1; // number of values in A
|
||||
#else
|
||||
temp = off+4; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
vres2_0 = VFMACCVV_FLOAT(vres2_0, va0, vb2, vl);
|
||||
vres3_0 = VFMACCVV_FLOAT(vres3_0, va0, vb3, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl * 4;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres2_0, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres3_0, v_z0, vlmax);
|
||||
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C2[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C3[0] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1; // number of values in A
|
||||
#else
|
||||
temp -= 4; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*1;
|
||||
ptrbb += temp*4;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
C2 = C2+1;
|
||||
C3 = C3+1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 4;
|
||||
#endif
|
||||
|
||||
k = (bk<<2);
|
||||
bb = bb+k;
|
||||
i = (ldc<<2);
|
||||
C = C+i;
|
||||
}
|
||||
|
||||
for (j=0; j<(bn&2); j+=2) // do the Mx2 loops
|
||||
{
|
||||
C0 = C;
|
||||
C1 = C0+ldc;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i=0; i<bm/4; i+=1) // do blocks of 4x2
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*4;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+4; // number of values in A
|
||||
#else
|
||||
temp = off+2; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
|
||||
|
||||
vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
|
||||
vres1_2 = VFMACCVV_FLOAT(vres1_2, va2, vb1, vl);
|
||||
|
||||
vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
|
||||
vres1_3 = VFMACCVV_FLOAT(vres1_3, va3, vb1, vl);
|
||||
|
||||
ptrba += vl * 4;
|
||||
ptrbb += vl * 2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres1_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_3, v_z0, vlmax);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 4; // number of values in A
|
||||
#else
|
||||
temp -= 2; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*4;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 4; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
C1 = C1+4;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 2 ) // do any 2x2 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+2; // number of values in A
|
||||
#else
|
||||
temp = off+2; // number of values in B
|
||||
#endif
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
vres1_1 = VFMACCVV_FLOAT(vres1_1, va1, vb1, vl);
|
||||
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl * 2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres1_0, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres1_1, v_z0, vlmax);
|
||||
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C1[1] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2; // number of values in A
|
||||
#else
|
||||
temp -= 2; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 2; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+2;
|
||||
C1 = C1+2;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 1 ) // do any 1x2 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*1;
|
||||
ptrbb = bb + off*2;
|
||||
#endif
|
||||
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres1_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1; // number of values in A
|
||||
#else
|
||||
temp = off+2; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
vres1_0 = VFMACCVV_FLOAT(vres1_0, va0, vb1, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl * 2;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres1_0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C1[0] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1; // number of values in A
|
||||
#else
|
||||
temp -= 2; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*1;
|
||||
ptrbb += temp*2;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
C1 = C1+1;
|
||||
|
||||
}
|
||||
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 2;
|
||||
#endif
|
||||
|
||||
k = (bk<<1);
|
||||
bb = bb+k;
|
||||
i = (ldc<<1);
|
||||
C = C+i;
|
||||
}
|
||||
|
||||
for (j=0; j<(bn&1); j+=1) // do the Mx1 loops
|
||||
{
|
||||
C0 = C;
|
||||
|
||||
#if defined(TRMMKERNEL) && defined(LEFT)
|
||||
off = offset;
|
||||
#endif
|
||||
|
||||
ptrba = ba;
|
||||
|
||||
for (i=0; i<bm/4; i+=1) // do blocks of 4x1 loops
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*4;
|
||||
ptrbb = bb + off*1;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_2 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_3 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+4; // number of values in A
|
||||
#else
|
||||
temp = off+1; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
|
||||
vres0_2 = VFMACCVV_FLOAT(vres0_2, va2, vb0, vl);
|
||||
|
||||
vres0_3 = VFMACCVV_FLOAT(vres0_3, va3, vb0, vl);
|
||||
|
||||
ptrba += vl * 4;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
vsum2 = VFREDSUMVS_FLOAT(vsum2, vres0_2, v_z0, vlmax);
|
||||
vsum3 = VFREDSUMVS_FLOAT(vsum3, vres0_3, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
C0[2] = alpha * VFMVFS_FLOAT_M1(vsum2);
|
||||
C0[3] = alpha * VFMVFS_FLOAT_M1(vsum3);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 4; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*4;
|
||||
ptrbb += temp*1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 4; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+4;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 2 ) // do any 2x1 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*2;
|
||||
ptrbb = bb + off*1;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
vres0_1 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+2; // number of values in A
|
||||
#else
|
||||
temp = off+1; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
|
||||
vres0_1 = VFMACCVV_FLOAT(vres0_1, va1, vb0, vl);
|
||||
|
||||
ptrba += vl * 2;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
vsum1 = VFREDSUMVS_FLOAT(vsum1, vres0_1, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
C0[1] = alpha * VFMVFS_FLOAT_M1(vsum1);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 2; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*2;
|
||||
ptrbb += temp*1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 2; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+2;
|
||||
|
||||
}
|
||||
|
||||
if ( bm & 1 ) // do any 1x1 loop
|
||||
{
|
||||
|
||||
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
ptrbb = bb;
|
||||
#else
|
||||
ptrba += off*1;
|
||||
ptrbb = bb + off*1;
|
||||
#endif
|
||||
|
||||
vres0_0 = VFMVVF_FLOAT(0, vlmax);
|
||||
|
||||
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
||||
temp = bk-off;
|
||||
#elif defined(LEFT)
|
||||
temp = off+1; // number of values in A
|
||||
#else
|
||||
temp = off+1; // number of values in B
|
||||
#endif
|
||||
|
||||
for (k = temp; k > 0; k -= vl)
|
||||
{
|
||||
vl = VSETVL(k);
|
||||
va0 = VLEV_FLOAT(ptrba, vl);
|
||||
vb0 = VLEV_FLOAT(ptrbb, vl);
|
||||
|
||||
vres0_0 = VFMACCVV_FLOAT(vres0_0, va0, vb0, vl);
|
||||
|
||||
ptrba += vl;
|
||||
ptrbb += vl;
|
||||
}
|
||||
|
||||
vsum0 = VFREDSUMVS_FLOAT(vsum0, vres0_0, v_z0, vlmax);
|
||||
C0[0] = alpha * VFMVFS_FLOAT_M1(vsum0);
|
||||
|
||||
|
||||
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
||||
temp = bk - off;
|
||||
#ifdef LEFT
|
||||
temp -= 1; // number of values in A
|
||||
#else
|
||||
temp -= 1; // number of values in B
|
||||
#endif
|
||||
ptrba += temp*1;
|
||||
ptrbb += temp*1;
|
||||
#endif
|
||||
|
||||
#ifdef LEFT
|
||||
off += 1; // number of values in A
|
||||
#endif
|
||||
|
||||
C0 = C0+1;
|
||||
|
||||
}
|
||||
|
||||
#if defined(TRMMKERNEL) && !defined(LEFT)
|
||||
off += 1;
|
||||
#endif
|
||||
|
||||
k = (bk<<0);
|
||||
bb = bb+k;
|
||||
C = C+ldc;
|
||||
}
|
||||
return 0;
|
||||
}
|
Loading…
Reference in New Issue