Merge pull request #4504 from sergei-lewis/dev/slewis/ci

Add builds and unit tests for new RISCV platforms to CI
2024-02-16 22:48:28 +01:00 · 2024-02-16 22:48:28 +01:00 · ebbf5b3ea0
parent 5266998b9f 461ecabb22
commit ebbf5b3ea0
5 changed files with 311 additions and 4 deletions
--- a/.github/workflows/riscv64_vector.yml
+++ b/.github/workflows/riscv64_vector.yml
@ -0,0 +1,253 @@
+name: riscv64 zvl256b qemu test
+
+on: [push, pull_request]
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }}
+  cancel-in-progress: true
+
+permissions:
+  contents: read # to fetch code (actions/checkout)
+
+jobs:
+  TEST:
+    if: "github.repository == 'OpenMathLib/OpenBLAS'"
+    runs-on: ubuntu-latest
+    env:
+      triple: riscv64-unknown-linux-gnu
+      riscv_gnu_toolchain: https://github.com/riscv-collab/riscv-gnu-toolchain
+      riscv_gnu_toolchain_version: 13.2.0
+      riscv_gnu_toolchain_nightly_download_path: /releases/download/2024.02.02/riscv64-glibc-ubuntu-22.04-llvm-nightly-2024.02.02-nightly.tar.gz
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - target: RISCV64_ZVL128B
+            opts: TARGET=RISCV64_ZVL128B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=128,elen=64
+          - target: RISCV64_ZVL256B
+            opts: TARGET=RISCV64_ZVL256B BINARY=64 ARCH=riscv64
+            qemu_cpu: rv64,g=true,c=true,v=true,vext_spec=v1.0,vlen=256,elen=64
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v3
+
+      - name: install build deps
+        run: |
+          sudo apt-get update
+          sudo apt-get install autoconf automake autotools-dev ninja-build make \
+          libgomp1-riscv64-cross ccache
+          wget ${riscv_gnu_toolchain}/${riscv_gnu_toolchain_nightly_download_path}
+          tar -xvf $(basename ${riscv_gnu_toolchain_nightly_download_path}) -C /opt
+
+      - name: Compilation cache
+        uses: actions/cache@v3
+        with:
+          path: ~/.ccache
+          key: ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}-${{ github.sha }}
+          restore-keys: |
+            ccache-${{ runner.os }}-${{ matrix.target }}-${{ github.ref }}
+            ccache-${{ runner.os }}-${{ matrix.target }}
+
+      - name: Configure ccache
+        run: |
+          test -d ~/.ccache || mkdir -p ~/.ccache
+          echo "max_size = 300M" > ~/.ccache/ccache.conf
+          echo "compression = true" >> ~/.ccache/ccache.conf
+          ccache -s
+
+      - name: build OpenBLAS libs
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='ccache clang --rtlib=compiler-rt -target ${triple} --sysroot /opt/riscv/sysroot --gcc-toolchain=/opt/riscv/lib/gcc/riscv64-unknown-linux-gnu/${riscv_gnu_toolchain_version}/' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc)
+
+      - name: build OpenBLAS tests
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) tests
+
+      - name: build lapack-netlib tests
+        working-directory: ./lapack-netlib/TESTING
+        run: |
+            export PATH="/opt/riscv/bin:$PATH"
+            make TARGET=${{ matrix.target }} CFLAGS="-DTARGET=${{ matrix.target }}" \
+            CC='${triple}-gcc' \
+            AR='ccache ${triple}-ar' AS='ccache ${triple}-gcc' LD='ccache ${triple}-gcc' \
+            RANLIB='ccache ${triple}-ranlib' \
+            FC='ccache ${triple}-gfortran' ${{ matrix.opts }} \
+            HOSTCC=gcc HOSTFC=gfortran -j$(nproc) \
+            LIN/xlintsts LIN/xlintstc LIN/xlintstd LIN/xlintstz LIN/xlintstrfs \
+            LIN/xlintstrfc LIN/xlintstrfd LIN/xlintstrfz LIN/xlintstds \
+            LIN/xlintstzc EIG/xeigtsts EIG/xeigtstc EIG/xeigtstd EIG/xeigtstz \
+
+      - name: OpenBLAS tests
+        shell: bash
+        run: |
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local DIR=$1; local CMD=$2; local DATA=$3; local OUTPUT="./test_out/$DIR.$CMD"; \
+            echo "`pwd`/$DIR/$CMD $DIR/$DATA" >> $OUTPUT; \
+            if [[ -z $DATA ]]; then qemu-riscv64 ./$DIR/$CMD |& tee $OUTPUT ; \ 
+            else qemu-riscv64 ./$DIR/$CMD < ./$DIR/$DATA |& tee $OUTPUT ; fi ; \
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi \
+          }
+          run_test test cblat1 &
+          run_test test cblat2 cblat2.dat &
+          run_test test cblat3 cblat3.dat &
+          run_test test dblat1 &
+          run_test test dblat2 dblat2.dat &
+          run_test test dblat3 dblat3.dat &
+          run_test test sblat1 &
+          run_test test sblat2 sblat2.dat &
+          run_test test sblat3 sblat3.dat &
+          run_test test zblat1 &
+          run_test test zblat2 zblat2.dat &
+          run_test test zblat3 zblat3.dat &
+          run_test ctest xccblat1 &
+          run_test ctest xccblat2 cin2 &
+          run_test ctest xccblat3 cin3 &
+          run_test ctest xdcblat1 &
+          run_test ctest xdcblat2 din2 &
+          run_test ctest xdcblat3 din3 &
+          run_test ctest xscblat1 &
+          run_test ctest xscblat2 sin2 &
+          run_test ctest xscblat3 sin3 &
+          run_test ctest xzcblat1 &
+          run_test ctest xzcblat2 zin2 &
+          run_test ctest xzcblat3 zin3 &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ; done < <(grep -lZ FAIL ./test_out/*)
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
+
+      - name: netlib tests
+        shell: bash
+        run: |
+          : # these take a very long time
+          echo "Skipping netlib tests in CI"
+          exit 0
+          : # comment out exit above to enable the tests
+          : # probably we want to identify a subset to run in CI
+          export PATH="/opt/riscv/bin:$PATH"
+          export QEMU_CPU=${{ matrix.qemu_cpu }}
+          rm -rf ./test_out
+          mkdir -p ./test_out
+          run_test() { local OUTPUT="./test_out/$1"; local DATA="./lapack-netlib/TESTING/$2"; local CMD="./lapack-netlib/TESTING/$3"; \
+            echo "$4" >> $OUTPUT; \
+            echo "$CMD" >> $OUTPUT; \
+            qemu-riscv64 $CMD < $DATA |& tee $OUTPUT; \
+            RV=$? ; if [[ $RV != 0 ]]; then echo "*** FAIL: nonzero exit code $RV" >> $OUTPUT ; fi; \
+            if grep -q fail $OUTPUT ; then echo "*** FAIL: log contains 'fail'" >> $OUTPUT ; fi ; \
+            if grep -q rror $OUTPUT | grep -v -q "passed" | grep -v "largest error" ; then echo "*** FAIL: log contains 'error'" >> $OUTPUT ; fi \
+          }
+          run_test stest.out stest.in LIN/xlintsts "Testing REAL LAPACK linear equation routines"  &
+          run_test ctest.out ctest.in LIN/xlintstc "Testing COMPLEX LAPACK linear equation routines"  &
+          run_test dtest.out dtest.in LIN/xlintstd "Testing DOUBLE PRECISION LAPACK linear equation routines"  &
+          run_test ztest.out ztest.in LIN/xlintstz "Testing COMPLEX16 LAPACK linear equation routines"  &
+          run_test dstest.out dstest.in LIN/xlintstds "Testing SINGLE-DOUBLE PRECISION LAPACK prototype linear equation routines"  &
+          run_test zctest.out zctest.in LIN/xlintstzc "Testing COMPLEX-COMPLEX16 LAPACK prototype linear equation routines"  &
+          run_test stest_rfp.out stest_rfp.in LIN/xlintstrfs "Testing REAL LAPACK RFP prototype linear equation routines"  &
+          run_test dtest_rfp.out dtest_rfp.in LIN/xlintstrfd "Testing DOUBLE PRECISION LAPACK RFP prototype linear equation routines"  &
+          run_test ctest_rfp.out ctest_rfp.in LIN/xlintstrfc "Testing COMPLEX LAPACK RFP prototype linear equation routines"  &
+          run_test ztest_rfp.out ztest_rfp.in LIN/xlintstrfz "Testing COMPLEX16 LAPACK RFP prototype linear equation routines"  &
+          run_test snep.out nep.in EIG/xeigtsts "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test ssep.out sep.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test sse2.out se2.in EIG/xeigtsts "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test ssvd.out svd.in EIG/xeigtsts "SVD - Testing Singular Value Decomposition routines"  &
+          run_test sec.out sec.in EIG/xeigtsts "SEC - Testing REAL Eigen Condition Routines"  &
+          run_test sed.out sed.in EIG/xeigtsts "SEV - Testing REAL Nonsymmetric Eigenvalue Driver"  &
+          run_test sgg.out sgg.in EIG/xeigtsts "SGG - Testing REAL Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test sgd.out sgd.in EIG/xeigtsts "SGD - Testing REAL Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test ssb.out ssb.in EIG/xeigtsts "SSB - Testing REAL Symmetric Eigenvalue Problem routines"  &
+          run_test ssg.out ssg.in EIG/xeigtsts "SSG - Testing REAL Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test sbal.out sbal.in EIG/xeigtsts "SGEBAL - Testing the balancing of a REAL general matrix"  &
+          run_test sbak.out sbak.in EIG/xeigtsts "SGEBAK - Testing the back transformation of a REAL balanced matrix"  &
+          run_test sgbal.out sgbal.in EIG/xeigtsts "SGGBAL - Testing the balancing of a pair of REAL general matrices"  &
+          run_test sgbak.out sgbak.in EIG/xeigtsts "SGGBAK - Testing the back transformation of a pair of REAL balanced matrices"  &
+          run_test sbb.out sbb.in EIG/xeigtsts "SBB - Testing banded Singular Value Decomposition routines"  &
+          run_test sglm.out glm.in EIG/xeigtsts "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test sgqr.out gqr.in EIG/xeigtsts "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test sgsv.out gsv.in EIG/xeigtsts "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test scsd.out csd.in EIG/xeigtsts "CSD - Testing CS Decomposition routines"  &
+          run_test slse.out lse.in EIG/xeigtsts "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test cnep.out nep.in EIG/xeigtstc "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test csep.out sep.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test cse2.out se2.in EIG/xeigtstc "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test csvd.out svd.in EIG/xeigtstc "SVD - Testing Singular Value Decomposition routines"  &
+          run_test cec.out cec.in EIG/xeigtstc "CEC - Testing COMPLEX Eigen Condition Routines"  &
+          run_test ced.out ced.in EIG/xeigtstc "CES - Testing COMPLEX Nonsymmetric Schur Form Driver"  &
+          run_test cgg.out cgg.in EIG/xeigtstc "CGG - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test cgd.out cgd.in EIG/xeigtstc "CGD - Testing COMPLEX Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test csb.out csb.in EIG/xeigtstc "CHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test csg.out csg.in EIG/xeigtstc "CSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test cbal.out cbal.in EIG/xeigtstc "CGEBAL - Testing the balancing of a COMPLEX general matrix"  &
+          run_test cbak.out cbak.in EIG/xeigtstc "CGEBAK - Testing the back transformation of a COMPLEX balanced matrix"  &
+          run_test cgbal.out cgbal.in EIG/xeigtstc "CGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test cgbak.out cgbak.in EIG/xeigtstc "CGGBAK - Testing the back transformation of a pair of COMPLEX balanced matrices"  &
+          run_test cbb.out cbb.in EIG/xeigtstc "CBB - Testing banded Singular Value Decomposition routines"  &
+          run_test cglm.out glm.in EIG/xeigtstc "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test cgqr.out gqr.in EIG/xeigtstc "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test cgsv.out gsv.in EIG/xeigtstc "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test ccsd.out csd.in EIG/xeigtstc "CSD - Testing CS Decomposition routines"  &
+          run_test clse.out lse.in EIG/xeigtstc "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test dnep.out nep.in EIG/xeigtstd "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test dsep.out sep.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dse2.out se2.in EIG/xeigtstd "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test dsvd.out svd.in EIG/xeigtstd "SVD - Testing Singular Value Decomposition routines"  &
+          run_test dec.out dec.in EIG/xeigtstd "DEC - Testing DOUBLE PRECISION Eigen Condition Routines"  &
+          run_test ded.out ded.in EIG/xeigtstd "DEV - Testing DOUBLE PRECISION Nonsymmetric Eigenvalue Driver"  &
+          run_test dgg.out dgg.in EIG/xeigtstd "DGG - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test dgd.out dgd.in EIG/xeigtstd "DGD - Testing DOUBLE PRECISION Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test dsb.out dsb.in EIG/xeigtstd "DSB - Testing DOUBLE PRECISION Symmetric Eigenvalue Problem routines"  &
+          run_test dsg.out dsg.in EIG/xeigtstd "DSG - Testing DOUBLE PRECISION Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test dbal.out dbal.in EIG/xeigtstd "DGEBAL - Testing the balancing of a DOUBLE PRECISION general matrix"  &
+          run_test dbak.out dbak.in EIG/xeigtstd "DGEBAK - Testing the back transformation of a DOUBLE PRECISION balanced matrix"  &
+          run_test dgbal.out dgbal.in EIG/xeigtstd "DGGBAL - Testing the balancing of a pair of DOUBLE PRECISION general matrices"  &
+          run_test dgbak.out dgbak.in EIG/xeigtstd "DGGBAK - Testing the back transformation of a pair of DOUBLE PRECISION balanced matrices"  &
+          run_test dbb.out dbb.in EIG/xeigtstd "DBB - Testing banded Singular Value Decomposition routines"  &
+          run_test dglm.out glm.in EIG/xeigtstd "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test dgqr.out gqr.in EIG/xeigtstd "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test dgsv.out gsv.in EIG/xeigtstd "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test dcsd.out csd.in EIG/xeigtstd "CSD - Testing CS Decomposition routines"  &
+          run_test dlse.out lse.in EIG/xeigtstd "LSE - Testing Constrained Linear Least Squares routines"  &
+          run_test znep.out nep.in EIG/xeigtstz "NEP - Testing Nonsymmetric Eigenvalue Problem routines"  &
+          run_test zsep.out sep.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zse2.out se2.in EIG/xeigtstz "SEP - Testing Symmetric Eigenvalue Problem routines"  &
+          run_test zsvd.out svd.in EIG/xeigtstz "SVD - Testing Singular Value Decomposition routines"  &
+          run_test zec.out zec.in EIG/xeigtstz "ZEC - Testing COMPLEX16 Eigen Condition Routines"  &
+          run_test zed.out zed.in EIG/xeigtstz "ZES - Testing COMPLEX16 Nonsymmetric Schur Form Driver"  &
+          run_test zgg.out zgg.in EIG/xeigtstz "ZGG - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem routines"  &
+          run_test zgd.out zgd.in EIG/xeigtstz "ZGD - Testing COMPLEX16 Nonsymmetric Generalized Eigenvalue Problem driver routines"  &
+          run_test zsb.out zsb.in EIG/xeigtstz "ZHB - Testing Hermitian Eigenvalue Problem routines"  &
+          run_test zsg.out zsg.in EIG/xeigtstz "ZSG - Testing Symmetric Generalized Eigenvalue Problem routines"  &
+          run_test zbal.out zbal.in EIG/xeigtstz "ZGEBAL - Testing the balancing of a COMPLEX16 general matrix"  &
+          run_test zbak.out zbak.in EIG/xeigtstz "ZGEBAK - Testing the back transformation of a COMPLEX16 balanced matrix"  &
+          run_test zgbal.out zgbal.in EIG/xeigtstz "ZGGBAL - Testing the balancing of a pair of COMPLEX general matrices"  &
+          run_test zgbak.out zgbak.in EIG/xeigtstz "ZGGBAK - Testing the back transformation of a pair of COMPLEX16 balanced matrices"  &
+          run_test zbb.out zbb.in EIG/xeigtstz "ZBB - Testing banded Singular Value Decomposition routines"  &
+          run_test zglm.out glm.in EIG/xeigtstz "GLM - Testing Generalized Linear Regression Model routines"  &
+          run_test zgqr.out gqr.in EIG/xeigtstz "GQR - Testing Generalized QR and RQ factorization routines"  &
+          run_test zgsv.out gsv.in EIG/xeigtstz "GSV - Testing Generalized Singular Value Decomposition routines"  &
+          run_test zcsd.out csd.in EIG/xeigtstz "CSD - Testing CS Decomposition routines"  &
+          run_test zlse.out lse.in EIG/xeigtstz "LSE - Testing Constrained Linear Least Squares routines"  &
+          wait
+          while IFS= read -r -d $'\0' LOG; do cat $LOG ; FAILURES=1 ;  done < <(grep -lZ FAIL ./test_out/*)
+          python ./lapack-netlib/lapack_testing.py -d ./test_out -e > netlib_summary
+          TOTALS="$(grep 'ALL PRECISIONS' netlib_summary)"
+          NUMERICAL_ERRORS=-1
+          OTHER_ERRORS=-1
+          . <(awk '/ALL PRECISIONS/{printf "NUMERICAL_ERRORS=%s\nOTHER_ERRORS=%s\n", $5, $7}' netlib_summary
+          if (( NUMERICAL_ERRORS != 0 )) || (( OTHER_ERRORS != 0 )) ; then cat netlib_summary ; FAILURES=1 ; fi
+          if [[ ! -z $FAILURES ]]; then echo "==========" ; echo "== FAIL ==" ; echo "==========" ; echo ; exit 1 ; fi
--- a/README.md
+++ b/README.md
@ -203,6 +203,16 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
  ```

+- **ZVL???B**: Level-3 BLAS and Level-1,2 including vectorised kernels targeting generic RISCV cores with vector support with registers of at least the corresponding width; ZVL128B and ZVL256B are available.
+e.g.:
+  ```sh
+make TARGET=RISCV64_ZVL256B CFLAGS="-DTARGET=RISCV64_ZVL256B" \
+    BINARY=64 ARCH=riscv64 CC='clang -target riscv64-unknown-linux-gnu' \
+    AR=riscv64-unknown-linux-gnu-ar AS=riscv64-unknown-linux-gnu-gcc \
+    LD=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran \
+    HOSTCC=gcc HOSTFC=gfortran -j
+  ```
+
 ### Support for multiple targets in a single library

 OpenBLAS can be built for multiple targets with runtime detection of the target cpu by specifiying `DYNAMIC_ARCH=1` in Makefile.rule, on the gmake command line or as `-DDYNAMIC_ARCH=TRUE` in cmake.
--- a/common_riscv64.h
+++ b/common_riscv64.h
@ -91,7 +91,7 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS

-#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
+#if defined(C910V) || defined(RISCV64_ZVL256B) || defined(RISCV64_ZVL128B) || defined(x280)
 # include <riscv_vector.h>
 #endif

--- a/kernel/riscv64/axpy_rvv.c
+++ b/kernel/riscv64/axpy_rvv.c
@ -30,19 +30,29 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if !defined(DOUBLE)
 #define VSETVL(n)               __riscv_vsetvl_e32m8(n)
 #define FLOAT_V_T               vfloat32m8_t
+#define FLOAT_V_M1_T            vfloat32m1_t
 #define VLEV_FLOAT              __riscv_vle32_v_f32m8
 #define VLSEV_FLOAT             __riscv_vlse32_v_f32m8
 #define VSEV_FLOAT              __riscv_vse32_v_f32m8
+#define VSEV_FLOAT_M1           __riscv_vse32_v_f32m1
 #define VSSEV_FLOAT             __riscv_vsse32_v_f32m8
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f32m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f32m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f32m8_f32m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f32m1
 #else
 #define VSETVL(n)               __riscv_vsetvl_e64m8(n)
 #define FLOAT_V_T               vfloat64m8_t
+#define FLOAT_V_M1_T            vfloat64m1_t
 #define VLEV_FLOAT              __riscv_vle64_v_f64m8
 #define VLSEV_FLOAT             __riscv_vlse64_v_f64m8
 #define VSEV_FLOAT              __riscv_vse64_v_f64m8
+#define VSEV_FLOAT_M1           __riscv_vse64_v_f64m1
 #define VSSEV_FLOAT             __riscv_vsse64_v_f64m8
 #define VFMACCVF_FLOAT          __riscv_vfmacc_vf_f64m8
+#define VFMVVF_FLOAT            __riscv_vfmv_v_f_f64m8
+#define VFREDSUMVS_FLOAT        __riscv_vfredusum_vs_f64m8_f64m1
+#define VFMVVF_FLOAT_M1         __riscv_vfmv_v_f_f64m1
 #endif

 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
@ -76,7 +86,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            VSEV_FLOAT(y, vy, vl);
        }

-    } else if (1 == inc_x) {
+    } else if (1 == inc_x && 0 != inc_y) {

        BLASLONG stride_y = inc_y * sizeof(FLOAT);

@ -89,8 +99,20 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            VSSEV_FLOAT(y, stride_y, vy, vl);
        }

-    } else {
+    } else if( 0 == inc_y ) {
+        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+        size_t in_vl = VSETVL(n);
+        vy = VFMVVF_FLOAT( y[0], in_vl );

+        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+            vl = VSETVL(n);
+            vx = VLSEV_FLOAT(x, stride_x, vl);
+            vy = VFMACCVF_FLOAT(vy, da, vx, vl);
+        }
+        FLOAT_V_M1_T vres = VFMVVF_FLOAT_M1( 0.0f, 1 );
+        vres = VFREDSUMVS_FLOAT( vy, vres, in_vl );
+        VSEV_FLOAT_M1(y, vres, 1);
+    } else {
        BLASLONG stride_x = inc_x * sizeof(FLOAT);
        BLASLONG stride_y = inc_y * sizeof(FLOAT);

--- a/kernel/riscv64/axpy_vector.c
+++ b/kernel/riscv64/axpy_vector.c
@ -51,11 +51,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #define VSETVL          JOIN(RISCV_RVV(vsetvl),    _e,     ELEN,   LMUL,   _)
 #define FLOAT_V_T       JOIN(vfloat,    	ELEN,   LMUL,   _t,     _)
+#define FLOAT_V_M1_T    JOIN(vfloat,    	ELEN,   m1,   _t,     _)
 #define VLEV_FLOAT      JOIN(RISCV_RVV(vle),       ELEN,   _v_f,   ELEN,   LMUL)
 #define VLSEV_FLOAT     JOIN(RISCV_RVV(vlse),      ELEN,   _v_f,   ELEN,   LMUL)
 #define VSEV_FLOAT      JOIN(RISCV_RVV(vse),       ELEN,   _v_f,   ELEN,   LMUL)
 #define VSSEV_FLOAT     JOIN(RISCV_RVV(vsse),      ELEN,   _v_f,   ELEN,   LMUL)
 #define VFMACCVF_FLOAT  JOIN(RISCV_RVV(vfmacc),    _vf_f, 	ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT    JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   LMUL,   _)
+#define VFMVVF_FLOAT_M1 JOIN(RISCV_RVV(vfmv),      _v_f_f, ELEN,   m1,     _)
+
+#ifdef RISCV_0p10_INTRINSICS
+#define VFREDSUMVS_FLOAT(va, vb, gvl) JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))(v_res, va, vb, gvl)
+#else
+#define VFREDSUMVS_FLOAT JOIN(RISCV_RVV(vfredusum_vs_f),  ELEN,   LMUL,   _f, JOIN2( ELEN,   m1))
+#endif

 int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
 {
@ -123,7 +132,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSEV_FLOAT(&y[j], vy0, gvl);
 			j += gvl;
 		}
-        }else if(inc_x == 1){
+	} else if (1 == inc_x && 0 != inc_y) {
 		stride_y = inc_y * sizeof(FLOAT);
                gvl = VSETVL(n);
                if(gvl <= n/2){
@ -151,6 +160,19 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
 			VSSEV_FLOAT(&y[j*inc_y], stride_y, vy0, gvl);
 			j += gvl;
 		}
+	} else if( 0 == inc_y ) {
+	        BLASLONG stride_x = inc_x * sizeof(FLOAT);
+	        size_t in_vl = VSETVL(n);
+	        vy0 = VFMVVF_FLOAT( y[0], in_vl );
+
+	        for (size_t vl; n > 0; n -= vl, x += vl*inc_x) {
+	            vl = VSETVL(n);
+	            vx0 = VLSEV_FLOAT(x, stride_x, vl);
+	            vy0 = VFMACCVF_FLOAT(vy0, da, vx0, vl);
+	        }
+	        FLOAT_V_M1_T v_res = VFMVVF_FLOAT_M1( 0.0f, 1 );
+	        v_res = VFREDSUMVS_FLOAT( vy0, v_res, in_vl );
+	        y[0] = EXTRACT_FLOAT(v_res);
 	}else{
 		stride_x = inc_x * sizeof(FLOAT);
 		stride_y = inc_y * sizeof(FLOAT);