597 lines
22 KiB
C
597 lines
22 KiB
C
/***************************************************************************
|
|
Copyright (c) 2022, The OpenBLAS Project
|
|
All rights reserved.
|
|
Redistribution and use in source and binary forms, with or without
|
|
modification, are permitted provided that the following conditions are
|
|
met:
|
|
1. Redistributions of source code must retain the above copyright
|
|
notice, this list of conditions and the following disclaimer.
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
notice, this list of conditions and the following disclaimer in
|
|
the documentation and/or other materials provided with the
|
|
distribution.
|
|
3. Neither the name of the OpenBLAS project nor the names of
|
|
its contributors may be used to endorse or promote products
|
|
derived from this software without specific prior written permission.
|
|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
*****************************************************************************/
|
|
|
|
#include "common.h"
|
|
|
|
#if !defined(DOUBLE)
|
|
#define VSETVL(n) __riscv_vsetvl_e32m2(n)
|
|
#define VSETVL_MAX __riscv_vsetvlmax_e32m2()
|
|
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e32m1()
|
|
#define FLOAT_V_T vfloat32m2_t
|
|
#define FLOAT_V_T_M1 vfloat32m1_t
|
|
#define VLEV_FLOAT __riscv_vle32_v_f32m2
|
|
#define VLSEG4_FLOAT __riscv_vlseg4e32_v_f32m2
|
|
#define VLSEG2_FLOAT __riscv_vlseg2e32_v_f32m2
|
|
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f32m2
|
|
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f32m2
|
|
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f32m2
|
|
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f32m2
|
|
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f32m2_f32m1
|
|
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f32m1
|
|
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f32m1_f32
|
|
#else
|
|
#define VSETVL(n) __riscv_vsetvl_e64m2(n)
|
|
#define VSETVL_MAX __riscv_vsetvlmax_e64m2()
|
|
#define VSETVL_MAX_M1 __riscv_vsetvlmax_e64m1()
|
|
#define FLOAT_V_T vfloat64m2_t
|
|
#define FLOAT_V_T_M1 vfloat64m1_t
|
|
#define VLEV_FLOAT __riscv_vle64_v_f64m2
|
|
#define VLSEG4_FLOAT __riscv_vlseg4e64_v_f64m2
|
|
#define VLSEG2_FLOAT __riscv_vlseg2e64_v_f64m2
|
|
#define VFMVVF_FLOAT __riscv_vfmv_v_f_f64m2
|
|
#define VFMACCVF_FLOAT __riscv_vfmacc_vf_f64m2
|
|
#define VFMACCVV_FLOAT __riscv_vfmacc_vv_f64m2
|
|
#define VFNMSACVV_FLOAT __riscv_vfnmsac_vv_f64m2
|
|
#define VFREDSUMVS_FLOAT __riscv_vfredusum_vs_f64m2_f64m1
|
|
#define VFMVVF_FLOAT_M1 __riscv_vfmv_v_f_f64m1
|
|
#define VFMVFS_FLOAT_M1 __riscv_vfmv_f_s_f64m1_f64
|
|
#endif
|
|
|
|
// Optimizes the implementation in ../generic/ztrmmkernel_2x2.c
|
|
|
|
|
|
/********************************
|
|
ADD1 a*c
|
|
ADD2 b*c
|
|
ADD3 a*d
|
|
ADD4 b*d
|
|
*********************************/
|
|
int CNAME(BLASLONG bm,BLASLONG bn,BLASLONG bk,FLOAT alphar,FLOAT alphai,FLOAT* ba,FLOAT* bb,
|
|
FLOAT* C,BLASLONG ldc, BLASLONG offset)
|
|
{
|
|
BLASLONG i,j,k;
|
|
FLOAT *C0,*C1,*ptrba,*ptrbb;
|
|
FLOAT res0,res1;
|
|
BLASLONG off, temp;
|
|
|
|
FLOAT_V_T va0, va1, va2, va3, vb0, vb1, vb2, vb3;
|
|
FLOAT_V_T vres0, vres1, vres2, vres3, vres4, vres5, vres6, vres7;
|
|
FLOAT_V_T_M1 v_m1_res0, v_m1_res1;
|
|
FLOAT_V_T_M1 v_z0 = VFMVVF_FLOAT_M1(0, VSETVL_MAX_M1);
|
|
|
|
size_t vl;
|
|
size_t vlmax = VSETVL_MAX;
|
|
|
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
off = -offset;
|
|
#else
|
|
off = 0;
|
|
#endif
|
|
|
|
for (j = bn/2; j > 0; j--)
|
|
{
|
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
|
off = offset;
|
|
#endif
|
|
C0 = C;
|
|
C1 = C0+2*ldc;
|
|
ptrba = ba;
|
|
|
|
for (i = bm/2; i > 0; i--)
|
|
{
|
|
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
ptrbb = bb;
|
|
#else
|
|
ptrba += off*2*2;
|
|
ptrbb = bb+off*2*2;
|
|
#endif
|
|
|
|
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres4 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres5 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres6 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres7 = VFMVVF_FLOAT(0.0, vlmax);
|
|
|
|
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
temp = bk - off;
|
|
#elif defined(LEFT)
|
|
temp = off + 2;
|
|
#else
|
|
temp = off + 2;
|
|
#endif
|
|
|
|
for (k = temp; k > 0; k -= vl)
|
|
{
|
|
vl = VSETVL(k);
|
|
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
|
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
|
|
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
|
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
|
vres4 = VFNMSACVV_FLOAT(vres4, va1, vb3, vl);
|
|
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
|
|
|
|
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
|
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
|
|
vres6 = VFNMSACVV_FLOAT(vres6, va3, vb3, vl);
|
|
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
|
|
#endif
|
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
|
vres5 = VFMACCVV_FLOAT(vres5, va1, vb2, vl);
|
|
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
|
|
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
|
|
|
|
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
|
vres7 = VFMACCVV_FLOAT(vres7, va3, vb2, vl);
|
|
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
|
|
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
|
|
#endif
|
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
|
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
|
|
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
|
|
vres5 = VFMACCVV_FLOAT(vres5, va0, vb3, vl);
|
|
|
|
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
|
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
|
|
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
|
|
vres7 = VFMACCVV_FLOAT(vres7, va2, vb3, vl);
|
|
#endif
|
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
vres4 = VFMACCVV_FLOAT(vres4, va0, vb2, vl);
|
|
vres5 = VFNMSACVV_FLOAT(vres5, va1, vb2, vl);
|
|
vres4 = VFMACCVV_FLOAT(vres4, va1, vb3, vl);
|
|
vres5 = VFNMSACVV_FLOAT(vres5, va0, vb3, vl);
|
|
|
|
vres6 = VFMACCVV_FLOAT(vres6, va2, vb2, vl);
|
|
vres7 = VFNMSACVV_FLOAT(vres7, va3, vb2, vl);
|
|
vres6 = VFMACCVV_FLOAT(vres6, va3, vb3, vl);
|
|
vres7 = VFNMSACVV_FLOAT(vres7, va2, vb3, vl);
|
|
|
|
#endif
|
|
ptrba += vl * 4;
|
|
ptrbb += vl * 4;
|
|
}
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C0[0] = res0 * alphar - res1 * alphai;
|
|
C0[1] = res1 * alphar + res0 * alphai;
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C0[2] = res0 * alphar - res1 * alphai;
|
|
C0[3] = res1 * alphar + res0 * alphai;
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres4, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres5, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C1[0] = res0 * alphar - res1 * alphai;
|
|
C1[1] = res1 * alphar + res0 * alphai;
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres6, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres7, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C1[2] = res0 * alphar - res1 * alphai;
|
|
C1[3] = res1 * alphar + res0 * alphai;
|
|
#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
(!defined(LEFT) && !defined(TRANSA))
|
|
temp = bk - off;
|
|
#ifdef LEFT
|
|
temp -= 2;
|
|
#else
|
|
temp -= 2;
|
|
#endif
|
|
|
|
ptrba += temp*2*2;
|
|
ptrbb += temp*2*2;
|
|
|
|
#endif
|
|
|
|
#ifdef LEFT
|
|
off += 2;
|
|
#endif
|
|
|
|
C0 = C0+4;
|
|
C1 = C1+4;
|
|
}
|
|
|
|
if (bm & 1)
|
|
{
|
|
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
ptrbb = bb;
|
|
#else
|
|
ptrba += off*2;
|
|
ptrbb = bb + off*2*2;
|
|
#endif
|
|
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
|
|
|
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
temp = bk - off;
|
|
#elif defined(LEFT)
|
|
temp = off+1;
|
|
#else
|
|
temp = off+2;
|
|
#endif
|
|
for (k = temp; k > 0; k -= vl)
|
|
{
|
|
vl = VSETVL(k);
|
|
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
|
VLSEG4_FLOAT(&vb0, &vb1, &vb2, &vb3, ptrbb, vl);
|
|
|
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
|
|
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
|
|
#endif
|
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va1, vb2, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
|
|
|
|
#endif
|
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va1, vb3, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va0, vb3, vl);
|
|
|
|
#endif
|
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
|
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va0, vb2, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va1, vb2, vl);
|
|
vres2 = VFNMSACVV_FLOAT(vres2, va1, vb3, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va0, vb3, vl);
|
|
|
|
#endif
|
|
ptrba += vl * 2;
|
|
ptrbb += vl * 4;
|
|
}
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C0[0] = res0 * alphar - res1 * alphai;
|
|
C0[1] = res1 * alphar + res0 * alphai;
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C1[0] = res0 * alphar - res1 * alphai;
|
|
C1[1] = res1 * alphar + res0 * alphai;
|
|
|
|
#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
(!defined(LEFT) && !defined(TRANSA))
|
|
temp = bk - off;
|
|
#ifdef LEFT
|
|
temp -= 1;
|
|
#else
|
|
temp -= 2;
|
|
#endif
|
|
ptrba += temp*2;
|
|
ptrbb += temp*2*2;
|
|
#endif
|
|
#ifdef LEFT
|
|
off += 1;
|
|
#endif
|
|
C0 = C0+2;
|
|
C1 = C1+2;
|
|
}
|
|
#if defined(TRMMKERNEL) && !defined(LEFT)
|
|
off += 2;
|
|
#endif
|
|
k = (bk<<2);
|
|
bb = bb+k;
|
|
i = (ldc<<2);
|
|
C = C+i;
|
|
}
|
|
|
|
if (bn & 1)
|
|
{
|
|
C0 = C;
|
|
#if defined(TRMMKERNEL) && defined(LEFT)
|
|
off = offset;
|
|
#endif
|
|
ptrba = ba;
|
|
|
|
for (i = bm/2; i > 0; i--)
|
|
{
|
|
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
ptrbb = bb;
|
|
#else
|
|
ptrba += off*2*2;
|
|
ptrbb = bb+off*2;
|
|
#endif
|
|
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres2 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres3 = VFMVVF_FLOAT(0.0, vlmax);
|
|
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
temp = bk - off;
|
|
#elif defined(LEFT)
|
|
temp = off + 2;
|
|
#else
|
|
temp = off + 1;
|
|
#endif
|
|
|
|
for (k = temp; k > 0; k -= vl)
|
|
{
|
|
vl = VSETVL(k);
|
|
VLSEG4_FLOAT(&va0, &va1, &va2, &va3, ptrba, vl);
|
|
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
|
|
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
|
#endif
|
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
|
#endif
|
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFMACCVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFMACCVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
#endif
|
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
vres2 = VFMACCVV_FLOAT(vres2, va2, vb0, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va3, vb0, vl);
|
|
vres2 = VFNMSACVV_FLOAT(vres2, va3, vb1, vl);
|
|
vres3 = VFNMSACVV_FLOAT(vres3, va2, vb1, vl);
|
|
|
|
#endif
|
|
ptrba += vl * 4;
|
|
ptrbb += vl * 2;
|
|
}
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C0[0] = res0 * alphar - res1 * alphai;
|
|
C0[1] = res1 * alphar + res0 * alphai;
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres2, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres3, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
C0[2] = res0 * alphar - res1 * alphai;
|
|
C0[3] = res1 * alphar + res0 * alphai;
|
|
|
|
#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
(!defined(LEFT) && !defined(TRANSA))
|
|
temp = bk-off;
|
|
#ifdef LEFT
|
|
temp -= 2;
|
|
#else
|
|
temp -= 1;
|
|
#endif
|
|
ptrba += temp*2*2;
|
|
ptrbb += temp*2;
|
|
#endif
|
|
#ifdef LEFT
|
|
off += 2;
|
|
#endif
|
|
C0 = C0+4;
|
|
}
|
|
|
|
if (bm & 1)
|
|
{
|
|
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
|
|
ptrbb = bb;
|
|
#else
|
|
ptrba += off*2;
|
|
ptrbb = bb + off*2;
|
|
#endif
|
|
vres0 = VFMVVF_FLOAT(0.0, vlmax);
|
|
vres1 = VFMVVF_FLOAT(0.0, vlmax);
|
|
|
|
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
|
|
temp = bk-off;
|
|
#elif defined(LEFT)
|
|
temp = off + 1;
|
|
#else
|
|
temp = off + 1;
|
|
#endif
|
|
|
|
for (k = temp; k > 0; k -= vl)
|
|
{
|
|
vl = VSETVL(k);
|
|
VLSEG2_FLOAT(&va0, &va1, ptrba, vl);
|
|
VLSEG2_FLOAT(&vb0, &vb1, ptrbb, vl);
|
|
|
|
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
#endif
|
|
#if defined(NR) || defined(NC) || defined(TR) || defined(TC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
#endif
|
|
#if defined(RN) || defined(RT) || defined(CN) || defined(CT)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFMACCVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFMACCVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
#endif
|
|
#if defined(RR) || defined(RC) || defined(CR) || defined(CC)
|
|
vres0 = VFMACCVV_FLOAT(vres0, va0, vb0, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va1, vb0, vl);
|
|
vres0 = VFNMSACVV_FLOAT(vres0, va1, vb1, vl);
|
|
vres1 = VFNMSACVV_FLOAT(vres1, va0, vb1, vl);
|
|
|
|
#endif
|
|
ptrba += vl * 2;
|
|
ptrbb += vl * 2;
|
|
|
|
}
|
|
|
|
v_m1_res0 = VFREDSUMVS_FLOAT(v_m1_res0, vres0, v_z0, vlmax);
|
|
v_m1_res1 = VFREDSUMVS_FLOAT(v_m1_res1, vres1, v_z0, vlmax);
|
|
res0 = VFMVFS_FLOAT_M1(v_m1_res0);
|
|
res1 = VFMVFS_FLOAT_M1(v_m1_res1);
|
|
|
|
C0[0] = res0 * alphar - res1 * alphai;
|
|
C0[1] = res1 * alphar + res0 * alphai;
|
|
|
|
#if ( defined(LEFT) && defined(TRANSA)) || \
|
|
(!defined(LEFT) && !defined(TRANSA))
|
|
temp = bk - off;
|
|
#ifdef LEFT
|
|
temp -= 1;
|
|
#else
|
|
temp -= 1;
|
|
#endif
|
|
ptrba += temp*2;
|
|
ptrbb += temp*2;
|
|
|
|
#endif
|
|
#ifdef LEFT
|
|
off += 1;
|
|
#endif
|
|
C0 = C0+2;
|
|
}
|
|
k = (bk<<1);
|
|
bb = bb+k;
|
|
i = (ldc<<1);
|
|
C = C+i;
|
|
}
|
|
return 0;
|
|
}
|