Merge branch 'develop' into win_tidy

2024-02-12 10:23:17 -08:00 · 2024-02-12 10:23:17 -08:00 · b29fd48998
parent 0a7ae326d2 b1ae777afb
commit b29fd48998
621 changed files with 96981 additions and 21930 deletions
--- a/.github/workflows/c910v.yml
+++ b/.github/workflows/c910v.yml
@ -14,8 +14,8 @@ jobs:
    if: "github.repository == 'OpenMathLib/OpenBLAS'"
    runs-on: ubuntu-latest
    env:
-      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1663142514282
-      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1-20220906.tar.gz
+      xuetie_toolchain: https://occ-oss-prod.oss-cn-hangzhou.aliyuncs.com/resource//1698113812618
+      toolchain_file_name: Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0-20231018.tar.gz
    strategy:
      fail-fast: false
      matrix:
@ -76,7 +76,7 @@ jobs:
        run: |
            wget ${xuetie_toolchain}/${toolchain_file_name}
            tar -xvf ${toolchain_file_name} -C /opt
-            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.6.1/bin:$PATH"
+            export PATH="/opt/Xuantie-900-gcc-linux-5.10.4-glibc-x86_64-V2.8.0/bin:$PATH"

            make CC='ccache ${{ matrix.triple }}-gcc -static' FC='ccache ${{ matrix.triple }}-gfortran -static' ${{ matrix.opts }} HOSTCC='ccache gcc' -j$(nproc)

--- a/.github/workflows/loongarch64.yml
+++ b/.github/workflows/loongarch64.yml
@ -16,13 +16,13 @@ jobs:
        include:
          - target: LOONGSONGENERIC
            triple:  loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSONGENERIC
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSONGENERIC
          - target: LOONGSON3R5
            triple: loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSON3R5
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON3R5
          - target: LOONGSON2K1000
            triple: loongarch64-unknown-linux-gnu
-            opts: NO_SHARED=1 TARGET=LOONGSON2K1000
+            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=LOONGSON2K1000
          - target: DYNAMIC_ARCH
            triple: loongarch64-unknown-linux-gnu
            opts: NO_SHARED=1 DYNAMIC_ARCH=1 TARGET=GENERIC
@ -40,8 +40,9 @@ jobs:

      - name: Download and install loongarch64-toolchain
        run: |
-          wget https://github.com/loongson/build-tools/releases/download/2022.09.06/loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz
-          tar -xf loongarch64-clfs-7.3-cross-tools-gcc-glibc.tar.xz -C /opt
+          wget https://github.com/sunhaiyong1978/CLFS-for-LoongArch/releases/download/8.1/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          #wget https://github.com/loongson/build-tools/releases/download/2023.08.08/CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz
+          tar -xf CLFS-loongarch64-8.1-x86_64-cross-tools-gcc-glibc.tar.xz -C /opt

      - name: Set env
        run: |
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 25.dev)
+set(OpenBLAS_PATCH_VERSION 26.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@ -256,15 +256,15 @@ if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  set (CMAKE_Fortran_USE_RESPONSE_FILE_FOR_OBJECTS 1)
  set (CMAKE_Fortran_CREATE_SHARED_LIBRARY
 "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
- "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+ "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
 "sh -c 'echo \"\" | ${CMAKE_Fortran_COMPILER} -o dummy.o -c -x f95-cpp-input - '"
- "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
+ "sh -c '${CMAKE_Fortran_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load dummy.o -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'"
 "sh -c 'ls -l ${CMAKE_BINARY_DIR}/lib'")
  else ()
  set (CMAKE_C_CREATE_SHARED_LIBRARY
   "sh -c 'cat ${CMAKE_BINARY_DIR}/CMakeFiles/openblas_shared.dir/objects*.rsp | xargs -n 1024 ${CMAKE_AR} -ru libopenblas.a && exit 0' "
-   "sh -c '${CMAKE_AR} -ru libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
-   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -undefined dynamic_lookup -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
+   "sh -c '${CMAKE_AR} -rs libopenblas.a ${CMAKE_BINARY_DIR}/driver/others/CMakeFiles/driver_others.dir/xerbla.c.o && exit 0' "
+   "sh -c '${CMAKE_C_COMPILER} -fpic -shared -Wl,-all_load -Wl,-force_load,libopenblas.a -Wl,-noall_load -o ${CMAKE_LIBRARY_OUTPUT_DIRECTORY}/libopenblas.${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.dylib'")
  endif ()
 endif()

--- a/CONTRIBUTORS.md
+++ b/CONTRIBUTORS.md
@ -220,4 +220,6 @@ In chronological order:
 * Mark Seminatore <https://github.com/mseminatore>
  * [2023-11-09] Improve Windows threading performance scaling
  * [2024-02-09] Introduce MT_TRACE facility and improve code consistency
-  
+
+* Dirreke <https://github.com/Dirreke>
+  * [2024-01-16] Add basic support for the CSKY architecture
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,49 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.26
+ 2-Jan-2024
+
+general:
+- improved the version of openblas.pc that is created by the CMAKE build
+- fixed a CMAKE-specific build problem on older versions of MacOS
+- worked around linking problems on old versions of MacOS
+- corrected installation location of the lapacke_mangling header in CMAKE builds
+- added type declarations for complex variables to the MSVC-specific parts of the LAPACK header
+- significantly sped up ?GESV for small problem sizes by introducing a lower bound for multithreading
+- imported additions and corrections from the Reference-LAPACK project:
+  - added new LAPACK functions for truncated QR with pivoting (Reference-LAPACK PRs 891&941)
+  - handle miscalculation of minimum work array size in corner cases (Reference-LAPACK PR 942)
+  - fixed use of uninitialized variables in ?GEDMD and improved inline documentation (PR 959)
+  - fixed use of uninitialized variables (and consequential failures) in ?BBCSD (PR 967)
+  - added tests for the recently introduced Dynamic Mode Decomposition functions (PR 736)
+  - fixed several memory leaks in the LAPACK testsuite (PR 953)
+  - fixed counting of testsuite results by the Python script (PR 954)
+   
+x86-64:
+- fixed computation of CASUM on SkylakeX and newer targets in the special
+  case that AVX512 is not supported by the compiler or operating environment
+- fixed potential undefined behaviour in the CASUM/ZASUM kernels for AVX512 targets
+- worked around a problem in the pre-AVX kernels for GEMV
+- sped up the thread management code on MS Windows
+
+arm64:
+- fixed building of the LAPACK testsuite with Xcode 15 on Apple M1 and newer
+- sped up the thread management code on MS Windows
+- sped up SGEMM and DGEMM on Neoverse V1 and N1
+- sped up ?DOT on SVE-capable targets
+- reduced the number of targets in DYNAMIC_ARCH builds by eliminating functionally equivalent ones
+- included support for Apple M1 and newer targets in DYNAMIC_ARCH builds
+
+power:
+- improved the SGEMM kernel for POWER10
+- fixed compilation with (very) old versions of gcc
+- fixed detection of old 32bit PPC targets in CMAKE-based builds
+- added autodetection of the POWERPC 7400 subtype
+- fixed CMAKE-based compilation for PPCG4 and PPC970 targets
+
+loongarch64:
+- added and improved optimized kernels for almost all BLAS functions
+
 ====================================================================
 Version 0.3.25
 12-Nov-2023
--- a/Makefile.arm64
+++ b/Makefile.arm64
@ -104,19 +104,25 @@ ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.4-a -mtune=neoverse-v1
 endif
 else
-CCOMMON_OPT += -march=armv8.4-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.4-a+sve 
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.4-a -mtune=native
+FCOMMON_OPT += -march=armv8.4-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
@ -132,25 +138,31 @@ ifeq (1, $(filter 1,$(GCCMINORVERSIONGTEQ4) $(GCCVERSIONGTEQ11) $(ISCLANG)))
 ifneq ($(OSNAME), Darwin)
 CCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 endif
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.5-a+sve+sve2+bf16 -mtune=neoverse-n2
 endif
 else
-CCOMMON_OPT += -march=armv8.5-a+sve -mtune=native
+CCOMMON_OPT += -march=armv8.5-a+sve
+ifneq ($(CROSS), 1)
+CCOMMON_OPT += -mtune=native
+endif
 ifneq ($(F_COMPILER), NAG)
-FCOMMON_OPT += -march=armv8.5-a -mtune=native
+FCOMMON_OPT += -march=armv8.5-a 
+ifneq ($(CROSS), 1)
+FCOMMON_OPT += -mtune=native
+endif
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8.2-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8.2-a -mtune=cortex-a72
 endif
 endif
 else
-CCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
+CCOMMON_OPT += -march=armv8-a+sve -mtune=cortex-a72
 ifneq ($(F_COMPILER), NAG)
 FCOMMON_OPT += -march=armv8-a -mtune=cortex-a72
 endif
--- a/Makefile.csky
+++ b/Makefile.csky
@ -0,0 +1,4 @@
+ifeq ($(CORE), CK860FV)
+CCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+FCOMMON_OPT += -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float -static
+endif
--- a/Makefile.prebuild
+++ b/Makefile.prebuild
@ -55,6 +55,26 @@ ifeq ($(TARGET), C910V)
 TARGET_FLAGS = -march=rv64gcv0p7_zfh_xtheadc -mabi=lp64d
 endif

+ifeq ($(TARGET), CK860FV)
+TARGET_FLAGS = -march=ck860v -mcpu=ck860fv -mfdivdu -mhard-float
+endif
+
+ifeq ($(TARGET), x280)
+TARGET_FLAGS = -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL256B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_ZVL128B)
+TARGET_FLAGS = -march=rv64imafdcv -mabi=lp64d
+endif
+
+ifeq ($(TARGET), RISCV64_GENERIC)
+TARGET_FLAGS = -march=rv64imafdc -mabi=lp64d
+endif
+
 all: getarch_2nd
 	./getarch_2nd  0 >> $(TARGET_MAKE)
 	./getarch_2nd  1 >> $(TARGET_CONF)
--- a/Makefile.riscv64
+++ b/Makefile.riscv64
@ -2,3 +2,19 @@ ifeq ($(CORE), C910V)
 CCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920
 FCOMMON_OPT += -march=rv64imafdcv0p7_zfh_xtheadc -mabi=lp64d -mtune=c920 -static
 endif
+ifeq ($(CORE), x280)
+CCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh_zvl512b -mabi=lp64d -ffast-math
+FCOMMON_OPT += -march=rv64imafdcv_zba_zbb_zfh -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL256B)
+CCOMMON_OPT += -march=rv64imafdcv_zvl256b -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_ZVL128B)
+CCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d 
+FCOMMON_OPT += -march=rv64imafdcv -mabi=lp64d -static
+endif
+ifeq ($(CORE), RISCV64_GENERIC)
+CCOMMON_OPT += -march=rv64imafdc -mabi=lp64d
+FCOMMON_OPT += -march=rv64imafdc -mabi=lp64d -static
+endif
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.25.dev
+VERSION = 0.3.26.dev

 # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a
 # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library
--- a/Makefile.system
+++ b/Makefile.system
@ -677,16 +677,12 @@ ifeq ($(ARCH), arm64)
 DYNAMIC_CORE =  ARMV8
 DYNAMIC_CORE += CORTEXA53
 DYNAMIC_CORE += CORTEXA57
-DYNAMIC_CORE += CORTEXA72
-DYNAMIC_CORE += CORTEXA73
 DYNAMIC_CORE += NEOVERSEN1
 ifneq ($(NO_SVE), 1)
 DYNAMIC_CORE += NEOVERSEV1
 DYNAMIC_CORE += NEOVERSEN2
 DYNAMIC_CORE += ARMV8SVE
 endif
-DYNAMIC_CORE += CORTEXA55
-DYNAMIC_CORE += FALKOR
 DYNAMIC_CORE += THUNDERX
 DYNAMIC_CORE += THUNDERX2T99
 DYNAMIC_CORE += TSV110
@ -877,6 +873,11 @@ endif
 endif
 endif

+ifeq ($(ARCH), csky)
+NO_BINARY_MODE	= 1
+BINARY_DEFINED	= 1
+endif
+
 #
 #  C Compiler dependent settings
 #
--- a/Makefile.x86_64
+++ b/Makefile.x86_64
@ -130,11 +130,11 @@ ifeq ($(C_COMPILER), GCC)
  endif
 endif
 else ifeq ($(C_COMPILER), CLANG)
- # cooperlake support was added in clang 12
+ # sapphire rapids support was added in clang 12
 ifeq ($(CLANGVERSIONGTEQ12), 1)
-  CCOMMON_OPT += -march=cooperlake
+  CCOMMON_OPT += -march=sapphirerapids
  ifneq ($(F_COMPILER), NAG)
-   FCOMMON_OPT += -march=cooperlake
+   FCOMMON_OPT += -march=sapphirerapids
  endif
 else  # not supported in clang, fallback to avx512
  CCOMMON_OPT += -march=skylake-avx512
--- a/README.md
+++ b/README.md
@ -196,7 +196,12 @@ Please read `GotoBLAS_01Readme.txt` for older CPU models already supported by th
  ```sh
  make HOSTCC=gcc TARGET=C910V CC=riscv64-unknown-linux-gnu-gcc FC=riscv64-unknown-linux-gnu-gfortran
  ```
-  (also known to work on C906)
+  (also known to work on C906 as long as you use only single-precision functions - its instruction set support appears to be incomplete in double precision)
+
+- **x280**: Level-1, Level-2 and Level-3 BLAS are optimized using the RISC-V Vector extension 1.0.
+  ```sh
+  make HOSTCC=gcc TARGET=x280 NUM_THREADS=8 CC=riscv64-unknown-linux-gnu-clang FC=riscv64-unknown-linux-gnu-gfortran
+  ```

 ### Support for multiple targets in a single library

@ -207,9 +212,11 @@ For **x86_64**, the list of targets this activates contains Prescott, Core2, Neh
 `DYNAMIC_ARCH` is also supported on **x86**, where it translates to Katmai, Coppermine, Northwood, Prescott, Banias,
 Core2, Penryn, Dunnington, Nehalem, Athlon, Opteron, Opteron_SSE3, Barcelona, Bobcat, Atom and Nano.

-On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus.
+On **ARMV8**, it enables support for CortexA53, CortexA57, CortexA72, CortexA73, Falkor, ThunderX, ThunderX2T99, TSV110 as well as generic ARMV8 cpus. If compiler support for SVE is available at build time, support for NeoverseN2, NeoverseV1 as well as generic ArmV8SVE targets is also enabled.

-For **POWER**, the list encompasses POWER6, POWER8 and POWER9, on **ZARCH** it comprises Z13 and Z14.
+For **POWER**, the list encompasses POWER6, POWER8 and POWER9. POWER10 is additionally available if a sufficiently recent compiler is used for the build.
+
+On **ZARCH**, it comprises Z13 and Z14 as well as generic zarch support.

 The `TARGET` option can be used in conjunction with `DYNAMIC_ARCH=1` to specify which cpu model should be assumed for all the
 common code in the library, usually you will want to set this to the oldest model you expect to encounter.
--- a/TargetList.txt
+++ b/TargetList.txt
@ -118,8 +118,11 @@ Z13
 Z14

 10.RISC-V 64:
-RISCV64_GENERIC
+RISCV64_GENERIC (e.g. PolarFire Soc/SiFive U54)
+RISCV64_ZVL128B
 C910V
+x280
+RISCV64_ZVL256B

 11.LOONGARCH64:
 LOONGSONGENERIC
@ -133,3 +136,7 @@ E2K
 EV4
 EV5
 EV6
+
+14.CSKY
+CSKY
+CK860FV
--- a/azure-pipelines.yml
+++ b/azure-pipelines.yml
@ -288,9 +288,9 @@ jobs:
     vmImage: 'ubuntu-latest'
  steps:
  - script: |
-        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.13.2/alpine-chroot-install \
-          && echo '60c7e0b5d82e21d1a549fc9a46ba3b36688c09dc  alpine-chroot-install' | sha1sum -c \
-          || exit 1
+        wget https://raw.githubusercontent.com/alpinelinux/alpine-chroot-install/v0.14.0/alpine-chroot-install \
+        && echo 'ccbf65f85cdc351851f8ad025bb3e65bae4d5b06  alpine-chroot-install' | sha1sum -c \
+        || exit 1
        alpine() { /alpine/enter-chroot -u "$USER" "$@"; }
        sudo sh alpine-chroot-install -p 'build-base gfortran perl linux-headers sudo'
        alpine make DYNAMIC_ARCH=1 BINARY=64
--- a/benchmark/Makefile
+++ b/benchmark/Makefile
@ -37,6 +37,12 @@ ESSL=/opt/ibm/lib
 #LIBESSL = -lesslsmp $(ESSL)/libxlomp_ser.so.1 $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a
 LIBESSL = -lesslsmp  $(ESSL)/libxlf90_r.so.1 $(ESSL)/libxlfmath.so.1 $(ESSL)/libxlsmp.so.1 /opt/ibm/xlC/13.1.3/lib/libxl.a

+# x280 temporary workaround for gfortran
+ifeq ($(TARGET), x280)
+CCOMMON_OPT:=$(filter-out -mllvm --riscv-v-vector-bits-min=512,$(CCOMMON_OPT))
+endif
+
+
 ifneq ($(NO_LAPACK), 1)
 GOTO_LAPACK_TARGETS=slinpack.goto dlinpack.goto clinpack.goto zlinpack.goto \
 		    scholesky.goto dcholesky.goto ccholesky.goto zcholesky.goto \
@ -3436,4 +3442,4 @@ smallscaling: smallscaling.c ../$(LIBNAME)
 clean ::
 	@rm -f *.goto *.mkl *.acml *.atlas *.veclib *.essl smallscaling

-include $(TOPDIR)/Makefile.tail
+include $(TOPDIR)/Makefile.tail
--- a/c_check
+++ b/c_check
@ -91,6 +91,7 @@ case "$data" in
    *ARCH_ZARCH*) architecture=zarch ;;
    *ARCH_RISCV64*) architecture=riscv64 ;;
    *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac

 defined=0
@ -236,6 +237,7 @@ case "$data" in
    *ARCH_ARM*) architecture=arm ;;
    *ARCH_ZARCH*) architecture=zarch ;;
    *ARCH_LOONGARCH64*) architecture=loongarch64 ;;
+    *ARCH_CSKY*) architecture=csky ;;
 esac

 binformat='bin32'
@ -244,6 +246,7 @@ case "$data" in
 esac

 no_avx512=0
+no_avx512bf=0
 if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
    tmpf="$tmpd/a.c"
@ -262,6 +265,25 @@ if [ "$architecture" = "x86" ] || [ "$architecture" = "x86_64" ]; then
    }

    rm -rf "$tmpd"
+    if [ "$no_avx512" -eq 0 ]; then
+    tmpd=$(mktemp -d 2>/dev/null || mktemp -d -t 'OBC')
+    tmpf="$tmpd/a.c"
+    code='"__m512 a= _mm512_dpbf16_ps(a, (__m512bh) _mm512_loadu_si512(%1]), (__m512bh) _mm512_loadu_si512(%2]));"'
+    printf "#include <immintrin.h>\n\nint main(void){ %s; }\n" "$code" >> "$tmpf"
+    if [ "$compiler" = "PGI" ]; then
+        args=" -tp cooperlake -c -o $tmpf.o $tmpf"
+    else
+        args=" -march=cooperlake -c -o $tmpf.o $tmpf"
+    fi
+    no_avx512bf=0
+    {
+        $compiler_name $flags $args >/dev/null 2>&1
+    } || {
+        no_avx512bf=1
+    }
+
+    rm -rf "$tmpd"
+  fi
 fi

 no_rv64gv=0
@ -409,6 +431,7 @@ done
 [ "$makefile" = "-" ] && {
    [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
    [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
    [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
    [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
    exit 0
@ -437,6 +460,7 @@ done
    [ "$no_sve" -eq 1 ] && printf "NO_SVE=1\n"
    [ "$no_rv64gv" -eq 1 ] && printf "NO_RV64GV=1\n"
    [ "$no_avx512" -eq 1 ] && printf "NO_AVX512=1\n"
+    [ "$no_avx512bf" -eq 1 ] && printf "NO_AVX512BF16=1\n"
    [ "$no_avx2" -eq 1 ] && printf "NO_AVX2=1\n"
    [ "$oldgcc" -eq 1 ] && printf "OLDGCC=1\n"
    [ "$no_lsx" -eq 1 ] && printf "NO_LSX=1\n"
--- a/c_check.pl
+++ b/c_check.pl
@ -97,6 +97,7 @@ $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = riscv64      if ($data =~ /ARCH_RISCV64/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);

 $defined = 0;

@ -156,6 +157,11 @@ if ($architecture eq "loongarch64") {
    $binary = 64;
 }

+if ($architecture eq "csky") {
+    $defined = 1;
+    $binary = 32;
+}
+
 if ($compiler eq "PGI") {
    $compiler_name .= " -tp p7"    if ($binary eq "32");
    $compiler_name .= " -tp p7-64" if ($binary eq "64");
@ -284,6 +290,7 @@ $architecture = arm          if ($data =~ /ARCH_ARM/);
 $architecture = arm64        if ($data =~ /ARCH_ARM64/);
 $architecture = zarch        if ($data =~ /ARCH_ZARCH/);
 $architecture = loongarch64  if ($data =~ /ARCH_LOONGARCH64/);
+$architecture = csky         if ($data =~ /ARCH_CSKY/);

 $binformat    = bin32;
 $binformat    = bin64  if ($data =~ /BINARY_64/);
--- a/cblas.h
+++ b/cblas.h
@ -12,6 +12,7 @@ extern "C" {
 /*Set the number of threads on runtime.*/
 void openblas_set_num_threads(int num_threads);
 void goto_set_num_threads(int num_threads);
+int openblas_set_num_threads_local(int num_threads);

 /*Get the number of threads on runtime.*/
 int openblas_get_num_threads(void);
@ -100,6 +101,16 @@ CBLAS_INDEX cblas_idamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPE
 CBLAS_INDEX cblas_icamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_izamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);

+float cblas_samax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
+float cblas_samin(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
+double cblas_damin(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
+float cblas_scamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
+double cblas_dzamin(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx);
+
 CBLAS_INDEX cblas_ismax(OPENBLAS_CONST blasint n, OPENBLAS_CONST float  *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_idmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx);
 CBLAS_INDEX cblas_icmax(OPENBLAS_CONST blasint n, OPENBLAS_CONST void  *x, OPENBLAS_CONST blasint incx);
@ -115,6 +126,9 @@ void cblas_daxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double alpha, OPENBLAS
 void cblas_caxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
 void cblas_zaxpy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);

+void cblas_caxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+void cblas_zaxpyc(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
+
 void cblas_scopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST float *x, OPENBLAS_CONST blasint incx, float *y, OPENBLAS_CONST blasint incy);
 void cblas_dcopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST double *x, OPENBLAS_CONST blasint incx, double *y, OPENBLAS_CONST blasint incy);
 void cblas_ccopy(OPENBLAS_CONST blasint n, OPENBLAS_CONST void *x, OPENBLAS_CONST blasint incx, void *y, OPENBLAS_CONST blasint incy);
@ -289,6 +303,14 @@ void cblas_zgemm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLA
 void cblas_zgemm3m(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N, OPENBLAS_CONST blasint K,
 		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

+void cblas_sgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
+void cblas_dgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST double alpha, OPENBLAS_CONST double *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST double *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST double beta, double *C, OPENBLAS_CONST blasint ldc);
+void cblas_cgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);
+void cblas_zgemmt(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransA, OPENBLAS_CONST enum CBLAS_TRANSPOSE TransB, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint K,
+		 OPENBLAS_CONST void *alpha, OPENBLAS_CONST void *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST void *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST void *beta, void *C, OPENBLAS_CONST blasint ldc);

 void cblas_ssymm(OPENBLAS_CONST enum CBLAS_ORDER Order, OPENBLAS_CONST enum CBLAS_SIDE Side, OPENBLAS_CONST enum CBLAS_UPLO Uplo, OPENBLAS_CONST blasint M, OPENBLAS_CONST blasint N,
                 OPENBLAS_CONST float alpha, OPENBLAS_CONST float *A, OPENBLAS_CONST blasint lda, OPENBLAS_CONST float *B, OPENBLAS_CONST blasint ldb, OPENBLAS_CONST float beta, float *C, OPENBLAS_CONST blasint ldc);
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@ -44,7 +44,7 @@ endif ()

 if (DYNAMIC_ARCH)
  if (ARM64)
-	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA55 CORTEXA57 CORTEXA72 CORTEXA73 FALKOR THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
+	  set(DYNAMIC_CORE ARMV8 CORTEXA53 CORTEXA57 THUNDERX THUNDERX2T99 TSV110 EMAG8180 NEOVERSEN1 THUNDERX3T110)
    if (${CMAKE_C_COMPILER_VERSION} VERSION_GREATER 9.99)
          set(DYNAMIC_CORE ${DYNAMIC_CORE} NEOVERSEV1 NEOVERSEN2 ARMV8SVE)
    endif ()
--- a/cmake/cc.cmake
+++ b/cmake/cc.cmake
@ -36,9 +36,22 @@ if (${CMAKE_C_COMPILER_ID} STREQUAL "GNU" OR ${CMAKE_C_COMPILER_ID} STREQUAL "LS

    if (LOONGARCH64)
+      # Probe the ABI flags with the C compiler: project() declares only C and ASM,
+      # so a C++-based flag check would exercise a language that is not configured.
+      include(CheckCCompilerFlag)
      if (BINARY64)
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+        check_c_compiler_flag("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if (COMPILER_SUPPORT_LP64D_ABI)
+          set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64d")
+        else ()
+          set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp64")
+        endif ()
      else ()
-        set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+        check_c_compiler_flag("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+        if (COMPILER_SUPPORT_ILP32D_ABI)
+          set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=ilp32d")
+        else ()
+          set(CCOMMON_OPT "${CCOMMON_OPT} -mabi=lp32")
+        endif ()
      endif ()
      set(BINARY_DEFINED 1)
    endif ()
@ -282,6 +292,27 @@ if (${CORE} STREQUAL POWER8)
  endif ()
 endif ()

+# Adding -mcpu=970 lets the build succeed, but the resulting library is broken, at least on macOS.
+# If someone tests on *BSD or Linux and adds this flag, please make sure it is not used in the macOS case.
+if (${CORE} STREQUAL PPC970)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=970 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
+# -mcpu=G4 seems to work fine, but perhaps avoid it for the sake of consistency?
+if (${CORE} STREQUAL PPCG4)
+  if (NOT DYNAMIC_ARCH)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -mtune=G4 -maltivec -fno-fast-math")
+  endif ()
+  if (APPLE)
+    set (CCOMMON_OPT  "${CCOMMON_OPT} -force_cpusubtype_ALL")
+  endif ()
+endif ()
+
 if (NOT DYNAMIC_ARCH)
 	if (HAVE_AVX2)
        set (CCOMMON_OPT  "${CCOMMON_OPT} -mavx2")
--- a/cmake/fc.cmake
+++ b/cmake/fc.cmake
@ -61,9 +61,19 @@ if (${F_COMPILER} STREQUAL "GFORTRAN" OR ${F_COMPILER} STREQUAL "F95" OR CMAKE_F
    endif ()
    if (LOONGARCH64)
      if (BINARY64)
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	CHECK_CXX_COMPILER_FLAG("-mabi=lp64d" COMPILER_SUPPORT_LP64D_ABI)
+        if(COMPILER_SUPPORT_LP64D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp64")
+	endif ()
      else ()
-        set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	CHECK_CXX_COMPILER_FLAG("-mabi=ilp32d" COMPILER_SUPPORT_ILP32D_ABI)
+	if(COMPILER_SUPPORT_ILP32D_ABI)
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=ilp32d")
+	else()
+	  set(FCOMMON_OPT "${FCOMMON_OPT} -mabi=lp32")
+	endif ()
      endif ()
    endif ()
    if (RISCV64)
--- a/cmake/openblas.pc.in
+++ b/cmake/openblas.pc.in
@ -5,7 +5,7 @@ includedir=@CMAKE_INSTALL_FULL_INCLUDEDIR@
 openblas_config=USE_64BITINT=@INTERFACE64@ NO_CBLAS=@NO_CBLAS@ NO_LAPACK=@NO_LAPACK@ NO_LAPACKE=@NO_LAPACKE@ DYNAMIC_ARCH=@DYNAMIC_ARCH@ DYNAMIC_OLDER=@DYNAMIC_OLDER@ NO_AFFINITY=@NO_AFFINITY@ USE_OPENMP=@USE_OPENMP@ @CORE@ MAX_THREADS=@NUM_THREADS@ 
 Name: OpenBLAS
 Description: OpenBLAS is an optimized BLAS library based on GotoBLAS2 1.13 BSD version
-Version: @OPENBLAS_VERSION@
-URL: https://github.com/xianyi/OpenBLAS
+Version: @OpenBLAS_VERSION@
+URL: https://github.com/OpenMathLib/OpenBLAS
 Libs: @OpenMP_C_FLAGS@ -L${libdir} -lopenblas${libsuffix} 
 Cflags: -I${includedir}
--- a/cmake/system_check.cmake
+++ b/cmake/system_check.cmake
@ -38,7 +38,7 @@ if(CMAKE_CL_64 OR MINGW64)
  endif()
 elseif(MINGW OR (MSVC AND NOT CMAKE_CROSSCOMPILING))
  set(X86 1)
-elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*")
+elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "ppc.*|power.*|Power.*" OR (CMAKE_SYSTEM_NAME MATCHES "Darwin" AND CMAKE_OSX_ARCHITECTURES MATCHES "ppc.*"))
  set(POWER 1)
 elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "mips64.*")
  set(MIPS64 1)
@ -109,7 +109,7 @@ else()
 endif ()

 if (NOT BINARY)
-  if (X86_64 OR ARM64 OR POWER OR MIPS64 OR LOONGARCH64 OR RISCV64)
+  if (X86_64 OR ARM64 OR MIPS64 OR LOONGARCH64 OR RISCV64 OR (POWER AND NOT (CMAKE_OSX_ARCHITECTURES STREQUAL "ppc")))
    set(BINARY 64)
  else ()
    set(BINARY 32)
--- a/common.h
+++ b/common.h
@ -396,7 +396,7 @@ typedef int blasint;
 #endif

 /***
-To alloc job_t on heap or statck.
+To alloc job_t on heap or stack.
 please https://github.com/xianyi/OpenBLAS/issues/246
 ***/
 #if defined(OS_WINDOWS)
@ -482,6 +482,10 @@ please https://github.com/xianyi/OpenBLAS/issues/246
 #include "common_e2k.h"
 #endif

+#ifdef ARCH_CSKY
+#include "common_csky.h"
+#endif
+
 #ifndef ASSEMBLER
 #ifdef OS_WINDOWSSTORE
 typedef char env_var_t[MAX_PATH];
--- a/common_csky.h
+++ b/common_csky.h
@ -0,0 +1,56 @@
+/*****************************************************************************
+Copyright (c) 2011-2015, The OpenBLAS Project
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+   1. Redistributions of source code must retain the above copyright
+      notice, this list of conditions and the following disclaimer.
+
+   2. Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in
+      the documentation and/or other materials provided with the
+      distribution.
+   3. Neither the name of the OpenBLAS project nor the names of 
+      its contributors may be used to endorse or promote products 
+      derived from this software without specific prior written 
+      permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+**********************************************************************************/
+
+#ifndef COMMON_CSKY	/* include guard for this header */
+#define COMMON_CSKY
+
+#define MB  __sync_synchronize()	/* MB, WMB and RMB all expand to the same __sync_synchronize() builtin call */
+#define WMB __sync_synchronize()
+#define RMB __sync_synchronize()
+
+#define INLINE inline
+
+#ifndef ASSEMBLER	/* the C function below is skipped when ASSEMBLER is defined */
+
+
+static inline int blas_quickdivide(blasint x, blasint y){
+  return x / y;	/* plain integer division; y == 0 is undefined behaviour in C */
+}
+
+#endif /* !ASSEMBLER */
+
+
+
+#define BUFFER_SIZE     ( 32 << 20)	/* 32 * 2^20 = 33554432; unit presumably bytes - confirm */
+#define SEEK_ADDRESS
+
+#endif /* COMMON_CSKY */
--- a/common_interface.h
+++ b/common_interface.h
@ -498,6 +498,15 @@ void BLASFUNC(zgemm3m)(char *, char *, blasint *, blasint *, blasint *, double *
 void BLASFUNC(xgemm3m)(char *, char *, blasint *, blasint *, blasint *, xdouble *,
 	   xdouble *, blasint *, xdouble *, blasint *, xdouble *, xdouble *, blasint *);

+void BLASFUNC(sgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(dgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+void BLASFUNC(cgemmt)(char*, char *, char *, blasint *, blasint *, float *,
+	   float  *, blasint *, float  *, blasint *, float  *, float  *, blasint *);
+void BLASFUNC(zgemmt)(char*, char *, char *, blasint *, blasint *, double *,
+	   double *, blasint *, double *, blasint *, double *, double *, blasint *);
+
 int BLASFUNC(sge2mm)(char *, char *, char *, blasint *, blasint *,
 		     float *, float  *, blasint *, float  *, blasint *,
 		     float *, float  *, blasint *);
@ -764,8 +773,8 @@ xdouble   BLASFUNC(qlamc3)(xdouble *, xdouble *);

 void    BLASFUNC(saxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
 void    BLASFUNC(daxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
-void    BLASFUNC(caxpby) (blasint *, float  *, float  *, blasint *, float *, float  *, blasint *);
-void    BLASFUNC(zaxpby) (blasint *, double  *, double  *, blasint *, double *, double  *, blasint *);
+void    BLASFUNC(caxpby) (blasint *, void  *, float  *, blasint *, void *, float  *, blasint *);
+void    BLASFUNC(zaxpby) (blasint *, void  *, double *, blasint *, void *, double  *, blasint *);

 void    BLASFUNC(somatcopy) (char *, char *, blasint *, blasint *, float  *, float  *, blasint *, float  *, blasint *);
 void    BLASFUNC(domatcopy) (char *, char *, blasint *, blasint *, double  *, double  *, blasint *, double  *, blasint *);
--- a/common_loongarch64.h
+++ b/common_loongarch64.h
@ -119,19 +119,47 @@ static inline int WhereAmI(void){
 #define MOV     fmov.d
 #define CMOVT   fsel
 #define MTC     movgr2fr.d
+#define MTG     movfr2gr.d
 #define FABS    fabs.d
+#define FMIN    fmin.d
+#define FMINA   fmina.d
+#define FMAX    fmax.d
+#define FMAXA   fmaxa.d
 #define CMPEQ   fcmp.ceq.d
 #define CMPLE   fcmp.cle.d
 #define CMPLT   fcmp.clt.d
 #define NEG     fneg.d
+#define FFINT   ffint.d.l

 #define XVFSUB  xvfsub.d
 #define XVFADD  xvfadd.d
+#define XVFMUL  xvfmul.d
 #define XVFMADD xvfmadd.d
+#define XVFMIN  xvfmin.d
+#define XVFMINA xvfmina.d
+#define XVFMAX  xvfmax.d
+#define XVFMAXA xvfmaxa.d
+#define XVCMPEQ xvfcmp.ceq.d
+#define XVCMPLE xvfcmp.cle.d
+#define XVCMPLT xvfcmp.clt.d
+#define XVMUL   xvfmul.d
+#define XVMSUB  xvfmsub.d
+#define XVNMSUB xvfnmsub.d

 #define VFSUB  vfsub.d
 #define VFADD  vfadd.d
+#define VFMUL  vfmul.d
 #define VFMADD vfmadd.d
+#define VFMIN  vfmin.d
+#define VFMINA vfmina.d
+#define VFMAX  vfmax.d
+#define VFMAXA vfmaxa.d
+#define VCMPEQ vfcmp.ceq.d
+#define VCMPLE vfcmp.cle.d
+#define VCMPLT vfcmp.clt.d
+#define VMUL   vfmul.d
+#define VMSUB  vfmsub.d
+#define VNMSUB vfnmsub.d

 #else

@ -147,19 +175,47 @@ static inline int WhereAmI(void){
 #define MOV     fmov.s
 #define CMOVT   fsel
 #define MTC     movgr2fr.w
+#define MTG     movfr2gr.s
 #define FABS    fabs.s
+#define FMIN    fmin.s
+#define FMINA   fmina.s
+#define FMAX    fmax.s
+#define FMAXA   fmaxa.s
 #define CMPEQ   fcmp.ceq.s
 #define CMPLE   fcmp.cle.s
 #define CMPLT   fcmp.clt.s
 #define NEG     fneg.s
+#define FFINT   ffint.s.l

 #define XVFSUB  xvfsub.s
 #define XVFADD  xvfadd.s
+#define XVFMUL  xvfmul.s
 #define XVFMADD xvfmadd.s
+#define XVFMIN  xvfmin.s
+#define XVFMINA xvfmina.s
+#define XVFMAX  xvfmax.s
+#define XVFMAXA xvfmaxa.s
+#define XVCMPEQ xvfcmp.ceq.s
+#define XVCMPLE xvfcmp.cle.s
+#define XVCMPLT xvfcmp.clt.s
+#define XVMUL   xvfmul.s
+#define XVMSUB  xvfmsub.s
+#define XVNMSUB xvfnmsub.s

 #define VFSUB  vfsub.s
 #define VFADD  vfadd.s
+#define VFMUL  vfmul.s
 #define VFMADD vfmadd.s
+#define VFMIN  vfmin.s
+#define VFMINA vfmina.s
+#define VFMAX  vfmax.s
+#define VFMAXA vfmaxa.s
+#define VCMPEQ vfcmp.ceq.s
+#define VCMPLE vfcmp.cle.s
+#define VCMPLT vfcmp.clt.s
+#define VMUL   vfmul.s
+#define VMSUB  vfmsub.s
+#define VNMSUB vfnmsub.s

 #endif /* defined(DOUBLE) */

--- a/common_riscv64.h
+++ b/common_riscv64.h
@ -91,8 +91,26 @@ static inline int blas_quickdivide(blasint x, blasint y){
 #define BUFFER_SIZE     ( 32 << 20)
 #define SEEK_ADDRESS

-#if defined(C910V)
-#include <riscv_vector.h>
+#if defined(C910V) || (defined(RISCV64_ZVL256B) && (defined(__clang__) || defined(RVV_COMPATIBLE_GCC))) || defined(RISCV64_ZVL128B) || defined(x280)
+# include <riscv_vector.h>
+#endif
+
+#if defined( __riscv_xtheadc ) && defined( __riscv_v ) && ( __riscv_v <= 7000 )
+// t-head toolchain uses obsolete rvv intrinsics, can't build for C910V without this
+#define RISCV_0p10_INTRINSICS
+#define RISCV_RVV(x) x
+#else
+#define RISCV_RVV(x) __riscv_ ## x
+#endif
+
+#if defined(C910V) || defined(RISCV64_ZVL256B)
+# if !defined(DOUBLE)
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f32m1_f32)(v)
+# else
+#  define EXTRACT_FLOAT(v) RISCV_RVV(vfmv_f_s_f64m1_f64)(v)
+# endif
+#else
+# define EXTRACT_FLOAT(v) (v[0])
 #endif

 #endif
--- a/common_thread.h
+++ b/common_thread.h
@ -137,19 +137,20 @@ typedef struct blas_queue {

 extern int blas_server_avail;
 extern int blas_omp_number_max;
+extern int blas_omp_threads_local;

 static __inline int num_cpu_avail(int level) {

 #ifdef USE_OPENMP
 int openmp_nthreads;
 	openmp_nthreads=omp_get_max_threads();
+	if (omp_in_parallel()) openmp_nthreads = blas_omp_threads_local;
 #endif

 #ifndef USE_OPENMP 
  if (blas_cpu_number == 1
-#endif
-#ifdef USE_OPENMP
-     if (openmp_nthreads == 1 || omp_in_parallel()
+#else
+     if (openmp_nthreads == 1 
 #endif
      ) return 1;        

--- a/cpuid_power.c
+++ b/cpuid_power.c
@ -160,6 +160,7 @@ int detect(void){
  infoCount = HOST_BASIC_INFO_COUNT;
  host_info(mach_host_self(), HOST_BASIC_INFO, (host_info_t)&hostInfo, &infoCount);

+  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7400) return CPUTYPE_PPCG4;
  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_7450) return CPUTYPE_PPCG4;
  if (hostInfo.cpu_subtype == CPU_SUBTYPE_POWERPC_970)  return CPUTYPE_PPC970;

--- a/cpuid_riscv64.c
+++ b/cpuid_riscv64.c
@ -70,12 +70,26 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* or implied, of The University of Texas at Austin.                 */
 /*********************************************************************/

-#define CPU_GENERIC   0
-#define CPU_C910V     1
+#define CPU_GENERIC         0
+#define CPU_C910V           1
+#define CPU_x280            2
+#define CPU_RISCV64_ZVL256B 3
+#define CPU_RISCV64_ZVL128B 4

 static char *cpuname[] = {
  "RISCV64_GENERIC",
-  "C910V"
+  "C910V",
+  "x280",
+  "RISCV64_ZVL256B",	/* same spelling as CORENAME for FORCE_RISCV64_ZVL256B in getarch.c */
+  "RISCV64_ZVL128B"	/* same spelling as CORENAME for FORCE_RISCV64_ZVL128B in getarch.c */
+};
+
+static char *cpuname_lower[] = {
+  "riscv64_generic",
+  "c910v",
+  "x280",
+  "riscv64_zvl256b",
+  "riscv64_zvl128b"
 };

 int detect(void){
@ -86,23 +100,29 @@ int detect(void){
  char *pmodel = NULL, *pisa = NULL;

  infile = fopen("/proc/cpuinfo", "r");
+  if (!infile)
+    return CPU_GENERIC;
  while (fgets(buffer, sizeof(buffer), infile)){
    if(!strncmp(buffer, "model name", 10)){
      strcpy(model_buffer, buffer);
-      pmodel = strchr(isa_buffer, ':') + 1;
+      pmodel = strchr(model_buffer, ':');
+      if (pmodel)
+        pmodel++;
    }

    if(!strncmp(buffer, "isa", 3)){
      strcpy(isa_buffer, buffer);
-      pisa = strchr(isa_buffer, '4') + 1;
+      pisa = strchr(isa_buffer, '4');
+      if (pisa)
+        pisa++;
    }
  }

  fclose(infile);

-  if (!pmodel)
+  if (!pmodel || !pisa)
   return(CPU_GENERIC);
-   
+
  if (strstr(pmodel, check_c910_str) && strchr(pisa, 'v'))
    return CPU_C910V;

@ -140,5 +160,5 @@ void get_cpuconfig(void){
 }

 void get_libname(void){
-  printf("riscv64\n");
+  printf("%s", cpuname_lower[detect()]);
 }
--- a/ctest.c
+++ b/ctest.c
@ -173,6 +173,10 @@ HAVE_C11
 ARCH_E2K
 #endif

+#if defined(__csky__)
+ARCH_CSKY
+#endif
+
 #if defined(__EMSCRIPTEN__)
 ARCH_RISCV64
 OS_WINDOWS
--- a/ctest/Makefile
+++ b/ctest/Makefile
@ -218,6 +218,9 @@ ifeq ($(F_COMPILER), IBM)
 ifeq ($(C_COMPILER), GCC)
 CEXTRALIB += -lgomp
 endif
+ifeq ($(C_COMPILER), CLANG)
+CEXTRALIB += -lomp
+endif
 endif
 endif

--- a/ctest/c_cblat1.f
+++ b/ctest/c_cblat1.f
@ -96,7 +96,7 @@
      INTEGER           ICAMAXTEST
      EXTERNAL          SCASUMTEST, SCNRM2TEST, ICAMAXTEST
 *     .. External Subroutines ..
-      EXTERNAL          CSCAL, CSSCALTEST, CTEST, ITEST1, STEST1
+      EXTERNAL          CSCALTEST, CSSCALTEST, CTEST, ITEST1, STEST1
 *     .. Intrinsic Functions ..
      INTRINSIC         MAX
 *     .. Common blocks ..
@ -214,8 +214,8 @@
               CALL STEST1(SCASUMTEST(N,CX,INCX),STRUE4(NP1),
     +                     STRUE4(NP1),SFAC)
            ELSE IF (ICASE.EQ.8) THEN
-*              .. CSCAL ..
-               CALL CSCAL(N,CA,CX,INCX)
+*              .. CSCALTEST ..
+               CALL CSCALTEST(N,CA,CX,INCX)
               CALL CTEST(LEN,CX,CTRUE5(1,NP1,INCX),CTRUE5(1,NP1,INCX),
     +                    SFAC)
            ELSE IF (ICASE.EQ.9) THEN
@ -236,14 +236,14 @@
 *
      INCX = 1
      IF (ICASE.EQ.8) THEN
-*        CSCAL
+*        CSCALTEST
 *        Add a test for alpha equal to zero.
         CA = (0.0E0,0.0E0)
         DO 80 I = 1, 5
            MWPCT(I) = (0.0E0,0.0E0)
            MWPCS(I) = (1.0E0,1.0E0)
   80    CONTINUE
-         CALL CSCAL(5,CA,CX,INCX)
+         CALL CSCALTEST(5,CA,CX,INCX)
         CALL CTEST(5,CX,MWPCT,MWPCS,SFAC)
      ELSE IF (ICASE.EQ.9) THEN
 *        CSSCALTEST
--- a/ctest/c_cblat1c.c
+++ b/ctest/c_cblat1c.c
@ -440,6 +440,7 @@ static real c_b43 = (float)1.;
    extern /* Subroutine */ int ctest_(integer*, complex*, complex*, complex*, real*);
    static complex mwpcs[5], mwpct[5];
    extern /* Subroutine */ int itest1_(integer*, integer*), stest1_(real*,real*,real*,real*);
+    extern /* Subroutine */ int cscaltest_(integer*, complex*, complex*, integer*); /* prototyped like ctest_/itest1_/stest1_ above */
    static complex cx[8];
    extern real scnrm2test_(integer*, complex*, integer*);
    static integer np1;
@ -481,7 +482,7 @@ static real c_b43 = (float)1.;
 		stest1_(&r__1, &strue4[np1 - 1], &strue4[np1 - 1], sfac);
 	    } else if (combla_1.icase == 8) {
 /*              .. CSCAL .. */
-		cscal_(&combla_1.n, &ca, cx, &combla_1.incx);
+		cscaltest_(&combla_1.n, &ca, cx, &combla_1.incx);
 		ctest_(&len, cx, &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48],
 			 &ctrue5[(np1 + combla_1.incx * 5 << 3) - 48], sfac);
 	    } else if (combla_1.icase == 9) {
@ -515,7 +516,7 @@ static real c_b43 = (float)1.;
 	    mwpcs[i__1].r = (float)1., mwpcs[i__1].i = (float)1.;
 /* L80: */
 	}
-	cscal_(&c__5, &ca, cx, &combla_1.incx);
+	cscaltest_(&c__5, &ca, cx, &combla_1.incx);
 	ctest_(&c__5, cx, mwpct, mwpcs, sfac);
    } else if (combla_1.icase == 9) {
 /*        CSSCALTEST */
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@ -113,6 +113,8 @@ extern unsigned int openblas_thread_timeout(void);
 /* We need this global for checking if initialization is finished.  */
 int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;

+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 #if   defined(USE_PTHREAD_LOCK)
 static pthread_mutex_t  server_lock    = PTHREAD_MUTEX_INITIALIZER;
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@ -69,6 +69,7 @@

 int blas_server_avail = 0;
 int blas_omp_number_max = 0;
+int blas_omp_threads_local = 1;

 extern int openblas_omp_adaptive_env(void);

--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@ -65,6 +65,8 @@ static CRITICAL_SECTION queue_lock;
 /* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;

+int blas_omp_threads_local = 1;
+
 /* Local Variables */
 static BLASULONG server_lock       = 0;

--- a/driver/others/dynamic.c
+++ b/driver/others/dynamic.c
@ -275,6 +275,7 @@ extern gotoblas_t  gotoblas_EXCAVATOR;
 #define gotoblas_SKYLAKEX gotoblas_SANDYBRIDGE
 #define gotoblas_COOPERLAKE gotoblas_SANDYBRIDGE
 #define gotoblas_ZEN gotoblas_SANDYBRIDGE
+#define gotoblas_SAPPHIRERAPIDS gotoblas_SANDYBRIDGE
 #else
 extern gotoblas_t  gotoblas_HASWELL;
 extern gotoblas_t  gotoblas_ZEN;
--- a/driver/others/dynamic_arm64.c
+++ b/driver/others/dynamic_arm64.c
@ -1,6 +1,6 @@
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
-/* Copyright 2023 The OpenBLAS Project                               */
+/* Copyright 2023-2024 The OpenBLAS Project                          */
 /* All rights reserved.                                              */
 /*                                                                   */
 /* Redistribution and use in source and binary forms, with or        */
@ -122,10 +122,11 @@ extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
 #else
 extern gotoblas_t  gotoblas_CORTEXA53;
+#define gotoblas_CORTEXA55 gotoblas_CORTEXA53
 extern gotoblas_t  gotoblas_CORTEXA57;
-extern gotoblas_t  gotoblas_CORTEXA72;
-extern gotoblas_t  gotoblas_CORTEXA73;
-extern gotoblas_t  gotoblas_FALKOR;
+#define gotoblas_CORTEXA72 gotoblas_CORTEXA57
+#define gotoblas_CORTEXA73 gotoblas_CORTEXA57
+#define gotoblas_FALKOR gotoblas_CORTEXA57
 extern gotoblas_t  gotoblas_THUNDERX;
 extern gotoblas_t  gotoblas_THUNDERX2T99;
 extern gotoblas_t  gotoblas_TSV110;
@ -141,14 +142,14 @@ extern gotoblas_t  gotoblas_ARMV8SVE;
 #define gotoblas_ARMV8SVE   gotoblas_ARMV8
 #endif
 extern gotoblas_t  gotoblas_THUNDERX3T110;
-extern gotoblas_t  gotoblas_CORTEXA55;
 #endif
+#define gotoblas_NEOVERSEV2 gotoblas_NEOVERSEV1

 extern void openblas_warning(int verbose, const char * msg);
 #define FALLBACK_VERBOSE 1
 #define NEOVERSEN1_FALLBACK "OpenBLAS : Your OS does not support SVE instructions. OpenBLAS is using Neoverse N1 kernels as a fallback, which may give poorer performance.\n"

-#define NUM_CORETYPES   16
+#define NUM_CORETYPES   17

 /*
 * In case asm/hwcap.h is outdated on the build system, make sure
@ -178,6 +179,7 @@ static char *corename[] = {
  "emag8180",
  "neoversen1",
  "neoversev1",
+  "neoversev2",
  "neoversen2",
  "thunderx3t110",
  "cortexa55",
@ -198,10 +200,11 @@ char *gotoblas_corename(void) {
  if (gotoblas == &gotoblas_EMAG8180)     return corename[ 9];
  if (gotoblas == &gotoblas_NEOVERSEN1)   return corename[10];
  if (gotoblas == &gotoblas_NEOVERSEV1)   return corename[11];
-  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[12];
-  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[13];
-  if (gotoblas == &gotoblas_CORTEXA55)    return corename[14];
-  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[15];
+  if (gotoblas == &gotoblas_NEOVERSEV2)   return corename[12];
+  if (gotoblas == &gotoblas_NEOVERSEN2)   return corename[13];
+  if (gotoblas == &gotoblas_THUNDERX3T110) return corename[14];
+  if (gotoblas == &gotoblas_CORTEXA55)    return corename[15];
+  if (gotoblas == &gotoblas_ARMV8SVE)     return corename[16];
  return corename[NUM_CORETYPES];
 }

@ -233,10 +236,11 @@ static gotoblas_t *force_coretype(char *coretype) {
    case  9: return (&gotoblas_EMAG8180);
    case 10: return (&gotoblas_NEOVERSEN1);
    case 11: return (&gotoblas_NEOVERSEV1);
-    case 12: return (&gotoblas_NEOVERSEN2);
-    case 13: return (&gotoblas_THUNDERX3T110);
-    case 14: return (&gotoblas_CORTEXA55);
-    case 15: return (&gotoblas_ARMV8SVE);
+    case 12: return (&gotoblas_NEOVERSEV2);
+    case 13: return (&gotoblas_NEOVERSEN2);
+    case 14: return (&gotoblas_THUNDERX3T110);
+    case 15: return (&gotoblas_CORTEXA55);
+    case 16: return (&gotoblas_ARMV8SVE);
  }
  snprintf(message, 128, "Core not found: %s\n", coretype);
  openblas_warning(1, message);
@ -247,6 +251,10 @@ static gotoblas_t *get_coretype(void) {
  int implementer, variant, part, arch, revision, midr_el1;
  char coremsg[128];

+#if defined (OS_DARWIN)
+  return &gotoblas_NEOVERSEN1;
+#endif
+	
 #if (!defined OS_LINUX && !defined OS_ANDROID)
  return NULL;
 #else
@ -308,6 +316,13 @@ static gotoblas_t *get_coretype(void) {
 	    return &gotoblas_NEOVERSEN1;
      	  }else
 	    return &gotoblas_NEOVERSEV1;
+  case 0xd4f:
+      if (!(getauxval(AT_HWCAP) & HWCAP_SVE)) {
+        openblas_warning(FALLBACK_VERBOSE, NEOVERSEN1_FALLBACK);
+        return &gotoblas_NEOVERSEN1;
+      } else {
+	      return &gotoblas_NEOVERSEV2;
+      }
 #endif
 	case 0xd05: // Cortex A55
 	  return &gotoblas_CORTEXA55;
@ -352,6 +367,9 @@ static gotoblas_t *get_coretype(void) {
          return &gotoblas_FALKOR;
      }
      break;
+    case 0x61: // Apple
+	return &gotoblas_NEOVERSEN1;
+      break;
    default:
      snprintf(coremsg, 128, "Unknown CPU model - implementer %x part %x\n",implementer,part);
      openblas_warning(1, coremsg);
--- a/driver/others/dynamic_power.c
+++ b/driver/others/dynamic_power.c
@ -43,6 +43,13 @@ char *gotoblas_corename(void) {
 #define CPU_POWER9   9
 #define CPU_POWER10 10

+#ifndef POWER_9
+#define POWER_9         0x20000         /* 9 class CPU */
+#endif
+#ifndef POWER_10
+#define POWER_10        0x40000         /* 10 class CPU */
+#endif
+
 #ifdef _AIX
 #include <sys/systemcfg.h>

@ -62,7 +69,7 @@ static int cpuid(void)
    else if (arch == POWER_9) return CPU_POWER9;
 #endif
 #ifdef POWER_10
-    else if (arch == POWER_10) return CPU_POWER10;
+    else if (arch >= POWER_10) return CPU_POWER10;
 #endif
    return CPU_UNKNOWN;
 }
@ -332,6 +339,9 @@ void gotoblas_dynamic_init(void) {
 	if (gotoblas && gotoblas -> init) {
 		strncpy(coren,gotoblas_corename(),20);
 		sprintf(coremsg, "Core: %s\n",coren);
+		if (getenv("GET_OPENBLAS_CORETYPE")) {
+			fprintf(stderr, "%s", coremsg);
+		}
 		openblas_warning(2, coremsg);
 		gotoblas -> init();
 	} else {
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -3214,7 +3214,7 @@ void blas_shutdown(void){
 #endif
    memory[pos].lock   = 0;
  }
-  if (memory_overflowed)
+  if (memory_overflowed) {
    for (pos = 0; pos < NEW_BUFFERS; pos ++){
      newmemory[pos].addr   = (void *)0;
      newmemory[pos].used   = 0;
@ -3222,6 +3222,10 @@ void blas_shutdown(void){
      newmemory[pos].pos    = -1;
 #endif
      newmemory[pos].lock   = 0;
+    }
+    free(newmemory);
+    newmemory = NULL;
+    memory_overflowed = 0;  
  }

  UNLOCK_COMMAND(&alloc_lock);
--- a/driver/others/openblas_set_num_threads.c
+++ b/driver/others/openblas_set_num_threads.c
@ -36,11 +36,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef SMP_SERVER

 extern  void openblas_set_num_threads(int num_threads) ;
+extern  int openblas_get_num_threads(void) ;

 void openblas_set_num_threads_(int* num_threads){
 	openblas_set_num_threads(*num_threads);
 }

+int openblas_set_num_threads_local(int num_threads){	/* sets a new thread count and returns the previous one */
+	int ret = openblas_get_num_threads();	/* capture the count in effect before the change */
+	openblas_set_num_threads(num_threads);
+	blas_omp_threads_local=num_threads;	/* num_cpu_avail() reads this when omp_in_parallel() is true */
+	return ret;
+}
+
+
 #else
 //Single thread

@ -50,4 +59,8 @@ void openblas_set_num_threads(int num_threads) {
 void openblas_set_num_threads_(int* num_threads){

 }
+
+int openblas_set_num_threads_local(int num_threads){
+	return 1;
+}
 #endif
--- a/exports/gensymbol
+++ b/exports/gensymbol
@ -60,6 +60,7 @@ cblasobjsc="
    cblas_ctbsv cblas_ctpmv cblas_ctpsv cblas_ctrmm cblas_ctrmv cblas_ctrsm cblas_ctrsv
    cblas_scnrm2 cblas_scasum cblas_cgemmt
    cblas_icamax cblas_icamin cblas_icmin cblas_icmax cblas_scsum cblas_cimatcopy cblas_comatcopy
+    cblas_caxpyc cblas_crotg cblas_csrot cblas_scamax cblas_scamin
    "
 cblasobjsd="
    cblas_dasum cblas_daxpy cblas_dcopy cblas_ddot
@ -69,6 +70,7 @@ cblasobjsd="
    cblas_dsyr2k cblas_dsyr cblas_dsyrk cblas_dtbmv cblas_dtbsv cblas_dtpmv cblas_dtpsv
    cblas_dtrmm cblas_dtrmv cblas_dtrsm cblas_dtrsv cblas_daxpby cblas_dgeadd cblas_dgemmt
    cblas_idamax cblas_idamin cblas_idmin cblas_idmax cblas_dsum cblas_dimatcopy cblas_domatcopy
+    cblas_damax  cblas_damin
    "

 cblasobjss="
@ -80,6 +82,7 @@ cblasobjss="
    cblas_stbmv cblas_stbsv cblas_stpmv cblas_stpsv cblas_strmm cblas_strmv cblas_strsm
    cblas_strsv cblas_sgeadd cblas_sgemmt
    cblas_isamax cblas_isamin cblas_ismin cblas_ismax cblas_ssum cblas_simatcopy cblas_somatcopy
+    cblas_samax cblas_samin
    "

 cblasobjsz="
@ -91,6 +94,7 @@ cblasobjsz="
    cblas_ztrsv cblas_cdotc_sub cblas_cdotu_sub cblas_zdotc_sub cblas_zdotu_sub
    cblas_zaxpby cblas_zgeadd cblas_zgemmt
    cblas_izamax cblas_izamin cblas_izmin cblas_izmax cblas_dzsum cblas_zimatcopy cblas_zomatcopy
+    cblas_zaxpyc cblas_zdrot cblas_zrotg cblas_dzamax cblas_dzamin
 "

 cblasobjs="cblas_xerbla"
@ -861,6 +865,53 @@ lapackobjs2z="$lapackobjs2z
    zgedmd
    zgedmdq
    "
+
+#functions added post 3.11
+
+lapackobjs2c="$lapackobjs2c
+    claqp2rk
+    claqp3rk
+    ctrsyl3
+    "
+#    claqz0
+#    claqz1
+#    claqz2
+#    claqz3
+#    clatrs3
+
+lapackobjs2d="$lapackobjs2d
+    dgelqs
+    dgelst
+    dgeqp3rk
+    dgeqrs
+    dlaqp2rk
+    dlaqp3rk
+    dlarmm
+    dlatrs3
+    dtrsyl3
+    "
+#    dlaqz0
+#    dlaqz1
+#    dlaqz2
+#    dlaqz3
+#    dlaqz4
+
+lapackobjs2z="$lapackobjs2z
+    zgelqs
+    zgelst
+    zgeqp3rk
+    zgeqrs
+    zlaqp2rk
+    zlaqp3rk
+    zlatrs3
+    zrscl
+    ztrsyl3
+    "
+#    zlaqz0
+#    zlaqz1
+#    zlaqz2
+#    zlaqz3
+
 lapack_extendedprecision_objs="
    zposvxx clagge clatms chesvxx cposvxx cgesvxx ssyrfssx csyrfsx
    dlagsy dsysvxx sporfsx slatms zlatms zherfsx csysvxx
@ -1622,6 +1673,14 @@ lapackeobjsc="
    LAPACKE_cgetsqrhrt_work
    LAPACKE_cungtsqr_row
    LAPACKE_cungtsqr_row_work
+    LAPACKE_clangb
+    LAPACKE_clangb_work
+    LAPACKE_ctrsyl3
+    LAPACKE_ctrsyl3_work
+    LAPACKE_ctz_nancheck
+    LAPACKE_ctz_trans
+    LAPACKE_cunhr_col
+    LAPACKE_cunhr_col_work
 "

 lapackeobjsd="
@ -2239,6 +2298,14 @@ lapackeobjsd="
    LAPACKE_dgetsqrhrt_work
    LAPACKE_dorgtsqr_row
    LAPACKE_dorgtsqr_row_work
+    LAPACKE_dlangb
+    LAPACKE_dlangb_work
+    LAPACKE_dorhr_col
+    LAPACKE_dorhr_col_work
+    LAPACKE_dtrsyl3
+    LAPACKE_dtrsyl3_work
+    LAPACKE_dtz_nancheck
+    LAPACKE_dtz_trans
 "

 lapackeobjss="
@ -2848,6 +2915,14 @@ lapackeobjss="
    LAPACKE_sgetsqrhrt_work
    LAPACKE_sorgtsqr_row
    LAPACKE_sorgtsqr_row_work
+    LAPACKE_slangb
+    LAPACKE_slangb_work
+    LAPACKE_sorhr_col
+    LAPACKE_sorhr_col_work
+    LAPACKE_strsyl3
+    LAPACKE_strsyl3_work
+    LAPACKE_stz_nancheck
+    LAPACKE_stz_trans
 "

 lapackeobjsz="
@ -3515,6 +3590,14 @@ lapackeobjsz="
    LAPACKE_zgetsqrhrt_work
    LAPACKE_zungtsqr_row
    LAPACKE_zungtsqr_row_work
+    LAPACKE_zlangb
+    LAPACKE_zlangb_work
+    LAPACKE_ztrsyl3
+    LAPACKE_ztrsyl3_work
+    LAPACKE_ztz_nancheck
+    LAPACKE_ztz_trans
+    LAPACKE_zunhr_col
+    LAPACKE_zunhr_col_work
 "
 ## @(SRCX_OBJ) from `lapack-3.4.1/lapacke/src/Makefile`
 ## Not exported: requires LAPACKE_EXTENDED to be set and depends on the
@ -3616,6 +3699,7 @@ lapack_embeded_underscore_objs_s="
    ssysv_aa_2stage ssytrf_aa_2stage
    ssytrs_aa_2stage
    slaorhr_col_getrfnp slaorhr_col_getrfnp2 sorhr_col
+    slarfb_gett
 "
 lapack_embeded_underscore_objs_c="
    chetf2_rook chetrf_rook chetri_rook
@ -3641,6 +3725,7 @@ lapack_embeded_underscore_objs_c="
    csysv_aa_2stage csytrf_aa_2stage
    csytrs_aa_2stage
    claunhr_col_getrfnp claunhr_col_getrfnp2 cunhr_col
+    clarfb_gett
 "
 lapack_embeded_underscore_objs_d="
    dlasyf_rook
@ -3658,6 +3743,7 @@ lapack_embeded_underscore_objs_d="
     dsysv_aa_2stage
    dsytrf_aa_2stage dsytrs_aa_2stage
    dlaorhr_col_getrfnp dlaorhr_col_getrfnp2 dorhr_col
+    dlarfb_gett
 "
 lapack_embeded_underscore_objs_z="
    zhetf2_rook zhetrf_rook zhetri_rook
@ -3682,6 +3768,7 @@ lapack_embeded_underscore_objs_z="
    zhetrs_aa_2stage zsysv_aa_2stage
    zsytrf_aa_2stage zsytrs_aa_2stage
    zlaunhr_col_getrfnp zlaunhr_col_getrfnp2 zunhr_col
+    zlarfb_gett
 "

 dirname=`pwd -P`/../lapack-netlib
--- a/10
+++ b/10
@ -45,7 +45,7 @@ if [ -z "$compiler" ]; then
 	      pathf90 pathf95
 	      pgf95 pgf90 pgf77 pgfortran nvfortran
 	      flang egfortran
-              ifort nagfor ifx ftn crayftn"
+              ifort nagfor ifx ftn crayftn armflang"

    for list in $lists; do
        for p in $path; do
@ -85,7 +85,11 @@ else
 	    *Hewlett*)
 		vendor=CRAY
 		openmp='-fopenmp'
-		;;		
+		;;
+   	    *Arm\ F90*)
+		vendor=FLANG
+		openmp='-fopenmp'
+		;;	
            *GNU*|*GCC*)

                v="${data#*GCC: *\) }"
@ -108,7 +112,7 @@ else
                	    if [ "$major" -ge 17 ]; then
                        	vendor=FLANGNEW
 			    fi	
-			;;
+			    ;;
                        *ifort*|*ifx*)
                            vendor=INTEL
                            openmp='-fopenmp'
--- a/getarch.c
+++ b/getarch.c
@ -150,6 +150,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 /* #define FORCE_EV4		*/
 /* #define FORCE_EV5		*/
 /* #define FORCE_EV6		*/
+/* #define FORCE_CSKY		*/
+/* #define FORCE_CK860FV		*/
 /* #define FORCE_GENERIC	*/

 #ifdef FORCE_P2
@ -1677,9 +1679,46 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define LIBNAME   "c910v"
 #define CORENAME  "C910V"
 #endif
+#endif
+#ifdef FORCE_x280
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "x280"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-Dx280 " \
+       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "x280"
+#define CORENAME  "x280"
 #else
 #endif

+#ifdef FORCE_RISCV64_ZVL256B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL256B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG   "-DRISCV64_ZVL256B " \
+       "-DL1_DATA_SIZE=64536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=262144 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME   "riscv64_zvl256b"
+#define CORENAME  "RISCV64_ZVL256B"
+#endif
+
+#ifdef FORCE_RISCV64_ZVL128B
+#define FORCE
+#define ARCHITECTURE    "RISCV64"
+#define SUBARCHITECTURE "RISCV64_ZVL128B"
+#define SUBDIRNAME      "riscv64"
+#define ARCHCONFIG "-DRISCV64_ZVL128B "                          \
+                   "-DL1_DATA_SIZE=32768 -DL1_DATA_LINESIZE=32 " \
+                   "-DL2_SIZE=1048576 -DL2_LINESIZE=32 "         \
+                   "-DDTB_DEFAULT_ENTRIES=128 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=4 "
+#define LIBNAME  "riscv64_zvl128b"
+#define CORENAME "RISCV64_ZVL128B"
+#endif

 #if defined(FORCE_E2K) || defined(__e2k__)
 #define FORCE
@ -1692,6 +1731,33 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define CORENAME  "generic"
 #endif

+#ifdef FORCE_CSKY	/* forced configuration: ARCHITECTURE/SUBARCHITECTURE "CSKY" */
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CSKY"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCSKY " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "csky"
+#define CORENAME  "CSKY"
+#endif /* FORCE_CSKY; the adjacent literals in ARCHCONFIG are concatenated, so each piece must end in a space */
+
+#ifdef FORCE_CK860FV	/* forced configuration for the CK860FV core of the CSKY architecture */
+#define FORCE
+#define ARCHITECTURE    "CSKY"
+#define SUBARCHITECTURE "CK860FV"
+#define SUBDIRNAME      "csky"
+#define ARCHCONFIG   "-DCK860FV " \
+       "-DL1_DATA_SIZE=65536 -DL1_DATA_LINESIZE=32 " \
+       "-DL2_SIZE=524288 -DL2_LINESIZE=32 " \
+       "-DDTB_DEFAULT_ENTRIES=64 -DDTB_SIZE=4096 -DL2_ASSOCIATIVE=8 "
+#define LIBNAME   "ck860fv"
+#define CORENAME  "CK860FV"
+#endif /* FORCE_CK860FV; SUBARCHITECTURE uses the same spelling as -DCK860FV and CORENAME */
+
+
 #ifndef FORCE

 #ifdef USER_TARGET
@ -1766,7 +1832,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #define OPENBLAS_SUPPORTED
 #endif

-
 #ifndef OPENBLAS_SUPPORTED
 #error "This arch/CPU is not supported by OpenBLAS."
 #endif
@ -1831,7 +1896,7 @@ int main(int argc, char *argv[]){
 #ifdef FORCE
    printf("CORE=%s\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__alpha__) || defined(__csky__)
    printf("CORE=%s\n", get_corename());
 #endif
 #endif
@ -1979,7 +2044,7 @@ printf("ELF_VERSION=2\n");
 #ifdef FORCE
    printf("#define CHAR_CORENAME \"%s\"\n", CORENAME);
 #else
-#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv)
+#if defined(INTEL_AMD) || defined(POWER) || defined(__mips__) || defined(__arm__) || defined(__aarch64__) || defined(ZARCH) || defined(sparc) || defined(__loongarch__) || defined(__riscv) || defined(__csky__)
    printf("#define CHAR_CORENAME \"%s\"\n", get_corename());
 #endif
 #endif
--- a/interface/CMakeLists.txt
+++ b/interface/CMakeLists.txt
@ -119,6 +119,7 @@ endif ()
 if (BUILD_BFLOAT16)
 	GenerateNamedObjects("bf16dot.c" "" "sbdot" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("gemm.c" "" "sbgemm" ${CBLAS_FLAG} "" "" true "BFLOAT16")
+	GenerateNamedObjects("gemmt.c" "" "sbgemmt" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("sbgemv.c" "" "sbgemv" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "SINGLE_PREC" "sbstobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
 	GenerateNamedObjects("tobf16.c" "DOUBLE_PREC" "sbdtobf16" ${CBLAS_FLAG} "" "" true "BFLOAT16")
@ -130,6 +131,8 @@ endif ()
 foreach (float_type ${FLOAT_TYPES})

  if (${float_type} STREQUAL "COMPLEX" OR ${float_type} STREQUAL "ZCOMPLEX")
+    GenerateNamedObjects("zaxpy.c" "" "axpyc" ${CBLAS_FLAG} "" "" false ${float_type})
+
    GenerateNamedObjects("zger.c" "" "geru" ${CBLAS_FLAG} "" "" false ${float_type})
    GenerateNamedObjects("zger.c" "CONJ" "gerc" ${CBLAS_FLAG} "" "" false ${float_type})
    GenerateNamedObjects("zdot.c" "CONJ" "dotc" ${CBLAS_FLAG} "" "" false ${float_type})
--- a/interface/Makefile
+++ b/interface/Makefile
@ -270,7 +270,8 @@ CSBLAS1OBJS   = \
 	cblas_scopy.$(SUFFIX) cblas_sdot.$(SUFFIX) cblas_sdsdot.$(SUFFIX) cblas_dsdot.$(SUFFIX) \
 	cblas_srot.$(SUFFIX) cblas_srotg.$(SUFFIX) cblas_srotm.$(SUFFIX) cblas_srotmg.$(SUFFIX) \
 	cblas_sscal.$(SUFFIX) cblas_sswap.$(SUFFIX) cblas_snrm2.$(SUFFIX) cblas_saxpby.$(SUFFIX) \
-	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX)
+	cblas_ismin.$(SUFFIX) cblas_ismax.$(SUFFIX) cblas_ssum.$(SUFFIX) cblas_samax.$(SUFFIX) \
+	cblas_samin.$(SUFFIX)

 CSBLAS2OBJS   = \
 	cblas_sgemv.$(SUFFIX) cblas_sger.$(SUFFIX) cblas_ssymv.$(SUFFIX) cblas_strmv.$(SUFFIX) \
@ -295,7 +296,8 @@ CDBLAS1OBJS   = \
 	cblas_dcopy.$(SUFFIX) cblas_ddot.$(SUFFIX) \
 	cblas_drot.$(SUFFIX) cblas_drotg.$(SUFFIX) cblas_drotm.$(SUFFIX) cblas_drotmg.$(SUFFIX) \
 	cblas_dscal.$(SUFFIX) cblas_dswap.$(SUFFIX) cblas_dnrm2.$(SUFFIX) cblas_daxpby.$(SUFFIX) \
-	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX)
+	cblas_idmin.$(SUFFIX) cblas_idmax.$(SUFFIX) cblas_dsum.$(SUFFIX) cblas_damax.$(SUFFIX) \
+	cblas_damin.$(SUFFIX)

 CDBLAS2OBJS   = \
 	cblas_dgemv.$(SUFFIX) cblas_dger.$(SUFFIX) cblas_dsymv.$(SUFFIX) cblas_dtrmv.$(SUFFIX) \
@ -315,7 +317,7 @@ CCBLAS1OBJS   = \
 	cblas_cdotc_sub.$(SUFFIX) cblas_cdotu_sub.$(SUFFIX) \
 	cblas_cscal.$(SUFFIX) cblas_csscal.$(SUFFIX) \
 	cblas_cswap.$(SUFFIX) cblas_scnrm2.$(SUFFIX) \
-	cblas_caxpby.$(SUFFIX) \
+	cblas_caxpby.$(SUFFIX) cblas_scamax.$(SUFFIX) cblas_caxpyc.$(SUFFIX) cblas_scamin.$(SUFFIX) \
 	cblas_icmin.$(SUFFIX) cblas_icmax.$(SUFFIX) cblas_scsum.$(SUFFIX) cblas_csrot.$(SUFFIX) cblas_crotg.$(SUFFIX)

 CCBLAS2OBJS   = \
@ -340,12 +342,12 @@ CXERBLAOBJ = \

 CZBLAS1OBJS   = \
 	cblas_izamax.$(SUFFIX) cblas_izamin.$(SUFFIX) cblas_dzasum.$(SUFFIX)  cblas_zaxpy.$(SUFFIX) \
-	cblas_zcopy.$(SUFFIX) \
+	cblas_zcopy.$(SUFFIX) cblas_dzamax.$(SUFFIX) cblas_dzamin.$(SUFFIX) \
 	cblas_zdotc.$(SUFFIX) cblas_zdotu.$(SUFFIX) \
 	cblas_zdotc_sub.$(SUFFIX) cblas_zdotu_sub.$(SUFFIX) \
 	cblas_zscal.$(SUFFIX) cblas_zdscal.$(SUFFIX) \
 	cblas_zswap.$(SUFFIX) cblas_dznrm2.$(SUFFIX) \
-	cblas_zaxpby.$(SUFFIX) \
+	cblas_zaxpby.$(SUFFIX) cblas_zaxpyc.$(SUFFIX) \
 	cblas_izmin.$(SUFFIX) cblas_izmax.$(SUFFIX) cblas_dzsum.$(SUFFIX) cblas_zdrot.$(SUFFIX) cblas_zrotg.$(SUFFIX)


@ -1301,7 +1303,7 @@ xhpr2.$(SUFFIX) xhpr2.$(PSUFFIX) : zhpr2.c
 ifeq ($(BUILD_BFLOAT16),1)
 sbgemm.$(SUFFIX) sbgemm.$(PSUFFIX) : gemm.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
-sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
+sbgemmt.$(SUFFIX) sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -c $(CFLAGS) $< -o $(@F)
 endif

@ -1533,6 +1535,30 @@ cblas_icmin.$(SUFFIX) cblas_icmin.$(PSUFFIX) : imax.c
 cblas_izmin.$(SUFFIX) cblas_izmin.$(PSUFFIX) : imax.c
 	$(CC) $(CFLAGS) -DCBLAS -c -UUSE_ABS -DUSE_MIN $< -o $(@F)

+cblas_samax.$(SUFFIX) cblas_samax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_damax.$(SUFFIX) cblas_damax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_scamax.$(SUFFIX) cblas_scamax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_dzamax.$(SUFFIX) cblas_dzamax.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -UUSE_MIN $< -o $(@F)
+
+cblas_samin.$(SUFFIX) cblas_samin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_damin.$(SUFFIX) cblas_damin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_scamin.$(SUFFIX) cblas_scamin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
+cblas_dzamin.$(SUFFIX) cblas_dzamin.$(PSUFFIX) : max.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DUSE_ABS -DUSE_MIN $< -o $(@F)
+
 cblas_sasum.$(SUFFIX) cblas_sasum.$(PSUFFIX) : asum.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

@ -1627,6 +1653,15 @@ cblas_daxpy.$(SUFFIX) cblas_daxpy.$(PSUFFIX) : axpy.c
 cblas_caxpy.$(SUFFIX) cblas_caxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

+cblas_caxpyc.$(SUFFIX) cblas_caxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_zaxpyc.$(SUFFIX) cblas_zaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
+cblas_xaxpyc.$(SUFFIX) cblas_xaxpyc.$(PSUFFIX) : zaxpy.c
+	$(CC) $(CFLAGS) -DCBLAS -c -DCONJ $< -o $(@F)
+
 cblas_zaxpy.$(SUFFIX) cblas_zaxpy.$(PSUFFIX) : zaxpy.c
 	$(CC) $(CFLAGS) -DCBLAS -c $< -o $(@F)

@ -1932,7 +1967,7 @@ cblas_sgemmt.$(SUFFIX) cblas_sgemmt.$(PSUFFIX) : gemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)

 ifeq ($(BUILD_BFLOAT16),1)
-cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : gemmt.c ../param.h
+cblas_sbgemmt.$(SUFFIX) cblas_sbgemmt.$(PSUFFIX) : sbgemmt.c ../param.h
 	$(CC) -DCBLAS -c $(CFLAGS) $< -o $(@F)
 endif

--- a/interface/gemmt.c
+++ b/interface/gemmt.c
@ -78,6 +78,9 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,

 	char transA, transB, Uplo;
 	blasint nrowa, nrowb;
+#if defined(COMPLEX)
+	blasint ncolb;
+#endif
 	IFLOAT *buffer;
 	IFLOAT *aa, *bb;
 	FLOAT *cc;
@ -155,19 +158,27 @@ void NAME(char *UPLO, char *TRANSA, char *TRANSB,
 		uplo = 0;
 	if (Uplo == 'L')
 		uplo = 1;
-
+	
 	nrowa = m;
-	if (transa) nrowa = k;
+	if (transa & 1) nrowa = k;
 	nrowb = k;
-	if (transb) nrowb = m;
+#if defined(COMPLEX)
+	ncolb = m;
+#endif
+	if (transb & 1) {
+		nrowb = m;
+#if defined(COMPLEX)
+		ncolb = k;
+#endif
+	}

 	info = 0;

 	if (ldc < MAX(1, m))
 		info = 13;
-	if (ldb < MAX(1, nrowa))
+	if (ldb < MAX(1, nrowb))
 		info = 10;
-	if (lda < MAX(1, nrowb))
+	if (lda < MAX(1, nrowa))
 		info = 8;
 	if (k < 0)
 		info = 5;
@ -211,6 +222,9 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 	blasint info;
 	blasint lda, ldb;
 	FLOAT *a, *b;
+#if defined(COMPLEX)
+	blasint nrowb, ncolb;
+#endif
 	XFLOAT *buffer;

 	PRINT_DEBUG_CNAME;
@ -262,11 +276,22 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

 		info = -1;

-		blasint nrowa, nrowb;
+		blasint nrowa;
+#if !defined(COMPLEX)
+		blasint nrowb;
+#endif
 		nrowa = m;
-		if (transa) nrowa = k;
+		if (transa & 1) nrowa = k;
 		nrowb = k;
-		if (transb) nrowb = m;
+#if defined(COMPLEX)
+		ncolb = m;
+#endif
+		if (transb & 1) {
+			nrowb = m;
+#if defined(COMPLEX)
+			ncolb = k;
+#endif
+		}

 		if (ldc < MAX(1, m))
 			info = 13;
@ -330,26 +355,38 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

 		info = -1;

-		blasint ncola, ncolb;
-		ncola = k;
-		if (transa) ncola = m;
-		ncolb = m;
-		if (transb) ncolb = k;
+		blasint ncola; 
+#if !defined(COMPLEX)
+		blasint ncolb;
+#endif
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+#if defined(COMPLEX)
+		nrowb = m;
+#endif
+
+		if (transb & 1) {
+#if defined(COMPLEX)
+			nrowb = k;
+#endif
+			ncolb = m;
+		}

 		if (ldc < MAX(1,m))
 			info = 13;
 		if (ldb < MAX(1, ncolb))
-			info = 10;
-		if (lda < MAX(1, ncola))
 			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
 		if (k < 0)
 			info = 5;
 		if (m < 0)
 			info = 4;
 		if (transb < 0)
-			info = 3;
-		if (transa < 0)
 			info = 2;
+		if (transa < 0)
+			info = 3;
 		if (uplo < 0)
 			info = 1;
 	}
@ -428,7 +465,20 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

 	IDEBUG_START;

-	const blasint incb = (transb == 0) ? 1 : ldb;
+#if defined(COMPLEX)
+	if (transb > 1){
+#ifndef CBLAS
+		IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#else
+		if (order == CblasColMajor)
+			IMATCOPY_K_CNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+		if (order == CblasRowMajor)
+			IMATCOPY_K_RNC(nrowb, ncolb, (FLOAT)(1.0), (FLOAT)(0.0), b, ldb);
+#endif
+	}
+#endif
+
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;

 	if (uplo == 1) {
 		for (i = 0; i < m; i++) {
@ -438,19 +488,19 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #if defined(COMPLEX)
 			aa = a + i * 2;
 			bb = b + i * ldb * 2;
-			if (transa) {
+			if (transa & 1) {
 				aa = a + lda * i * 2;
 			}
-			if (transb)
+			if (transb & 1)
 				bb = b + i * 2;
 			cc = c + i * 2 * ldc + i * 2;
 #else
 			aa = a + i;
 			bb = b + i * ldb;
-			if (transa) {
+			if (transa & 1) {
 				aa = a + lda * i;
 			}
-			if (transb)
+			if (transb & 1)
 				bb = b + i;
 			cc = c + i * ldc + i;
 #endif
@ -461,7 +511,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);

 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -478,7 +528,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);

 #ifdef SMP

@ -491,7 +541,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif

 #if defined(COMPLEX)
-				if (!transa)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     aa, lda, bb, incb, cc, 1,
 						     buffer);
@ -500,7 +550,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 						     aa, lda, bb, incb, cc, 1,
 						     buffer);
 #else
-				if (!transa)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, aa, lda,
 						     bb, incb, cc, 1, buffer);
 				else
@ -509,7 +559,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 #ifdef SMP
 			} else {
-				if (!transa)
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, aa,
 							    lda, bb, incb, cc,
 							    1, buffer,
@ -533,13 +583,13 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 			l = j;
 #if defined COMPLEX
 			bb = b + i * ldb * 2;
-			if (transb) {
+			if (transb & 1) {
 				bb = b + i * 2;
 			}
 			cc = c + i * 2 * ldc;
 #else
 			bb = b + i * ldb;
-			if (transb) {
+			if (transb & 1) {
 				bb = b + i;
 			}
 			cc = c + i * ldc;
@ -551,7 +601,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 				       NULL, 0);

 			if (alpha_r == ZERO && alpha_i == ZERO)
-				return;
+				continue;
 #else
 			if (beta != ONE)
 				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
@ -567,7 +617,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif
 			// for alignment
 			buffer_size = (buffer_size + 3) & ~3;
-			STACK_ALLOC(buffer_size, FLOAT, buffer);
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);

 #ifdef SMP

@ -580,7 +630,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 #endif

 #if defined(COMPLEX)
-				if (!transa)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha_r, alpha_i,
 						     a, lda, bb, incb, cc, 1,
 						     buffer);
@ -589,7 +639,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
 						     a, lda, bb, incb, cc, 1,
 						     buffer);
 #else
-				if (!transa)
+				if (!(transa & 1))
 				(gemv[(int)transa]) (j, k, 0, alpha, a, lda, bb,
 						     incb, cc, 1, buffer);
 				else
@ -599,7 +649,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,

 #ifdef SMP
 			} else {
-				if (!transa)
+				if (!(transa & 1))
 				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
 							    bb, incb, cc, 1,
 							    buffer, nthreads);
--- a/interface/gemv.c
+++ b/interface/gemv.c
@ -226,7 +226,7 @@ void CNAME(enum CBLAS_ORDER order,

 #ifdef SMP

-  if ( 1L * m * n < 2304L * GEMM_MULTITHREAD_THRESHOLD )
+  if ( 1L * m * n < 115200L * GEMM_MULTITHREAD_THRESHOLD )
    nthreads = 1;
  else
    nthreads = num_cpu_avail(2);
--- a/interface/imatcopy.c
+++ b/interface/imatcopy.c
@ -154,7 +154,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
    }
 #endif

-	msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT);
+	if ( *rows >  *cols )
+            msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT);
+    else
+            msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT);

 	b = malloc(msize);
 	if ( b == NULL )
--- a/interface/lapack/gesv.c
+++ b/interface/lapack/gesv.c
@ -114,7 +114,14 @@ int NAME(blasint *N, blasint *NRHS, FLOAT *a, blasint *ldA, blasint *ipiv,

 #ifdef SMP
  args.common = NULL;
-  args.nthreads = num_cpu_avail(4);
+#ifndef DOUBLE
+  if (args.m*args.n < 40000)
+#else
+  if (args.m*args.n < 10000)
+#endif
+	args.nthreads=1;
+  else
+         args.nthreads = num_cpu_avail(4);

  if (args.nthreads == 1) {
 #endif
--- a/interface/max.c
+++ b/interface/max.c
@ -145,8 +145,13 @@ FLOATRET NAME(blasint *N, FLOAT *x, blasint *INCX){

 #else

+#ifdef COMPLEX
+FLOAT CNAME(blasint n, void *vx, blasint incx){
+  FLOAT *x = (FLOAT*) vx;
+#else
 FLOAT CNAME(blasint n, FLOAT *x, blasint incx){
-
+#endif
+  
  FLOAT ret;

  PRINT_DEBUG_CNAME;
--- a/interface/rotmg.c
+++ b/interface/rotmg.c
@ -96,12 +96,6 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 	else
 	{
 		dp2 = *dd2 * dy1;
-		if(dp2 == ZERO)
-		{
-			dflag = -TWO;
-			dparam[0] = dflag;
-			return;
-		}
 		dp1 = *dd1 * *dx1;
 		dq2 =  dp2 * dy1;
 		dq1 =  dp1 * *dx1;
@ -113,24 +107,10 @@ void CNAME(FLOAT *dd1, FLOAT *dd2, FLOAT *dx1, FLOAT dy1, FLOAT *dparam){
 			dh12 =    dp2 /  dp1;

 			du   = ONE - dh12 * dh21;
-			if(du > ZERO)
-			{
-				dflag = ZERO;
-				*dd1  = *dd1 / du;
-				*dd2  = *dd2 / du;
-				*dx1  = *dx1 * du;
-			} else {
-				dflag = -ONE;
-
-				dh11  = ZERO;
-				dh12  = ZERO;
-				dh21  = ZERO;
-				dh22  = ZERO;
-
-				*dd1  = ZERO;
-				*dd2  = ZERO;
-				*dx1  = ZERO;
-			}
+			dflag = ZERO;
+			*dd1  = *dd1 / du;
+			*dd2  = *dd2 / du;
+			*dx1  = *dx1 * du;
 			
 		}
 		else
--- a/interface/sbgemmt.c
+++ b/interface/sbgemmt.c
@ -0,0 +1,447 @@
+/*********************************************************************/
+/* Copyright 2024, The OpenBLAS Project.                             */
+/* All rights reserved.                                              */
+/*                                                                   */
+/* Redistribution and use in source and binary forms, with or        */
+/* without modification, are permitted provided that the following   */
+/* conditions are met:                                               */
+/*                                                                   */
+/*   1. Redistributions of source code must retain the above         */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer.                                                  */
+/*                                                                   */
+/*   2. Redistributions in binary form must reproduce the above      */
+/*      copyright notice, this list of conditions and the following  */
+/*      disclaimer in the documentation and/or other materials       */
+/*      provided with the distribution.                              */
+/*                                                                   */
+/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
+/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
+/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
+/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
+/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
+/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
+/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
+/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
+/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
+/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
+/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
+/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
+/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
+/*    POSSIBILITY OF SUCH DAMAGE.                                    */
+/*                                                                   */
+/*********************************************************************/
+
+#include <stdio.h>
+#include <stdlib.h>
+#include "common.h"
+
+#define SMP_THRESHOLD_MIN 65536.0
+#define ERROR_NAME "SBGEMMT "
+
+#ifndef GEMM_MULTITHREAD_THRESHOLD
+#define GEMM_MULTITHREAD_THRESHOLD 4
+#endif
+
+#ifndef CBLAS
+
+void NAME(char *UPLO, char *TRANSA, char *TRANSB,
+	  blasint * M, blasint * K,
+	  FLOAT * Alpha,
+	  IFLOAT * a, blasint * ldA,
+	  IFLOAT * b, blasint * ldB, FLOAT * Beta, FLOAT * c, blasint * ldC)
+{
+
+	blasint m, k;
+	blasint lda, ldb, ldc;
+	int transa, transb, uplo;
+	blasint info;
+
+	char transA, transB, Uplo;
+	blasint nrowa, nrowb;
+	IFLOAT *buffer;
+	IFLOAT *aa, *bb;
+	FLOAT *cc;
+	FLOAT alpha, beta;
+
+	PRINT_DEBUG_NAME;
+
+	m = *M;
+	k = *K;
+
+	alpha = *Alpha;
+	beta = *Beta;
+
+	lda = *ldA;
+	ldb = *ldB;
+	ldc = *ldC;
+
+	transA = *TRANSA;
+	transB = *TRANSB;
+	Uplo = *UPLO;
+	TOUPPER(transA);
+	TOUPPER(transB);
+	TOUPPER(Uplo);
+
+	transa = -1;
+	transb = -1;
+	uplo = -1;
+
+	if (transA == 'N')
+		transa = 0;
+	if (transA == 'T')
+		transa = 1;
+
+	if (transA == 'R')
+		transa = 0;
+	if (transA == 'C')
+		transa = 1;
+
+	if (transB == 'N')
+		transb = 0;
+	if (transB == 'T')
+		transb = 1;
+
+	if (transB == 'R')
+		transb = 0;
+	if (transB == 'C')
+		transb = 1;
+
+	if (Uplo == 'U')
+		uplo = 0;
+	if (Uplo == 'L')
+		uplo = 1;
+	nrowa = m;
+	if (transa & 1) nrowa = k;
+	nrowb = k;
+	if (transb & 1) nrowb = m;
+
+	info = 0;
+
+	if (ldc < MAX(1, m))
+		info = 13;
+	if (ldb < MAX(1, nrowb))
+		info = 10;
+	if (lda < MAX(1, nrowa))
+		info = 8;
+	if (k < 0)
+		info = 5;
+	if (m < 0)
+		info = 4;
+	if (transb < 0)
+		info = 3;
+	if (transa < 0)
+		info = 2;
+	if (uplo < 0)
+		info = 1;
+
+	if (info != 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+#else
+
+void CNAME(enum CBLAS_ORDER order, enum CBLAS_UPLO Uplo,
+	   enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANSPOSE TransB, blasint m,
+	   blasint k,
+	   FLOAT alpha,
+	   IFLOAT * A, blasint LDA,
+	   IFLOAT * B, blasint LDB, FLOAT beta, FLOAT * c, blasint ldc)
+{
+	IFLOAT *aa, *bb;
+        FLOAT *cc;
+
+	int transa, transb, uplo;
+	blasint info;
+	blasint lda, ldb;
+	IFLOAT *a, *b;
+	IFLOAT *buffer;	/* same element type as the STACK_ALLOC(buffer_size, IFLOAT, buffer) calls and the non-CBLAS declaration */
+
+	PRINT_DEBUG_CNAME;
+
+	uplo = -1;
+	transa = -1;
+	transb = -1;
+	info = 0;
+
+	if (order == CblasColMajor) {
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransA == CblasNoTrans)
+			transa = 0;
+		if (TransA == CblasTrans)
+			transa = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transa = 0;
+		if (TransA == CblasConjTrans)
+			transa = 1;
+
+		if (TransB == CblasNoTrans)
+			transb = 0;
+		if (TransB == CblasTrans)
+			transb = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transb = 0;
+		if (TransB == CblasConjTrans)
+			transb = 1;
+
+		a = (void *)A;
+		b = (void *)B;
+		lda = LDA;
+		ldb = LDB;
+
+		info = -1;
+
+		blasint nrowa;
+		blasint nrowb;
+		nrowa = m;
+		if (transa & 1) nrowa = k;
+		nrowb = k;
+		if (transb & 1)  nrowb = m;
+
+		if (ldc < MAX(1, m))
+			info = 13;
+		if (ldb < MAX(1, nrowb))
+			info = 10;
+		if (lda < MAX(1, nrowa))
+			info = 8;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 3;
+		if (transa < 0)
+			info = 2;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (order == CblasRowMajor) {
+
+		a = (void *)B;
+		b = (void *)A;
+
+		lda = LDB;
+		ldb = LDA;
+
+		if (Uplo == CblasUpper) uplo = 0;
+		if (Uplo == CblasLower) uplo = 1;
+
+		if (TransB == CblasNoTrans)
+			transa = 0;
+		if (TransB == CblasTrans)
+			transa = 1;
+
+		if (TransB == CblasConjNoTrans)
+			transa = 0;
+		if (TransB == CblasConjTrans)
+			transa = 1;
+
+		if (TransA == CblasNoTrans)
+			transb = 0;
+		if (TransA == CblasTrans)
+			transb = 1;
+
+		if (TransA == CblasConjNoTrans)
+			transb = 0;
+		if (TransA == CblasConjTrans)
+			transb = 1;
+
+		info = -1;
+
+		blasint ncola; 
+		blasint ncolb;
+
+		ncola = m;
+		if (transa & 1) ncola = k;
+		ncolb = k;
+
+		if (transb & 1) {
+			ncolb = m;
+		}
+
+		if (ldc < MAX(1,m))
+			info = 13;
+		if (ldb < MAX(1, ncolb))
+			info = 8;
+		if (lda < MAX(1, ncola))
+			info = 10;
+		if (k < 0)
+			info = 5;
+		if (m < 0)
+			info = 4;
+		if (transb < 0)
+			info = 2;
+		if (transa < 0)
+			info = 3;
+		if (uplo < 0)
+			info = 1;
+	}
+
+	if (info >= 0) {
+		BLASFUNC(xerbla) (ERROR_NAME, &info, sizeof(ERROR_NAME));
+		return;
+	}
+
+#endif
+	int buffer_size;
+	blasint i, j;
+
+#ifdef SMP
+	int nthreads;
+#endif
+
+
+#ifdef SMP
+	static int (*gemv_thread[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *,
+				     BLASLONG, IFLOAT *, BLASLONG, FLOAT,
+				     FLOAT *, BLASLONG, int) = {
+		sbgemv_thread_n, sbgemv_thread_t,
+	};
+#endif
+	int (*gemv[]) (BLASLONG, BLASLONG, FLOAT, IFLOAT *, BLASLONG,
+		       IFLOAT *, BLASLONG, FLOAT, FLOAT *, BLASLONG) = {
+	SBGEMV_N, SBGEMV_T,};
+
+
+	if (m == 0)
+		return;
+
+	IDEBUG_START;
+
+	const blasint incb = ((transb & 1) == 0) ? 1 : ldb;
+
+	if (uplo == 1) {
+		for (i = 0; i < m; i++) {
+			j = m - i;
+
+			aa = a + i;
+			bb = b + i * ldb;
+			if (transa & 1) {
+				aa = a + lda * i;
+			}
+			if (transb & 1)
+				bb = b + i;
+			cc = c + i * ldc + i;
+
+#if 0
+			if (beta != ONE)
+				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
+
+			if (alpha == ZERO)
+				continue;
+#endif
+
+			IDEBUG_START;
+
+			buffer_size = j + k + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+			buffer_size += 160 / sizeof(FLOAT);
+#endif
+			// for alignment
+			buffer_size = (buffer_size + 3) & ~3;
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
+
+#ifdef SMP
+
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, aa, lda,
+						     bb, incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, aa,
+							    lda, bb, incb, beta, cc,
+							    1, nthreads);
+
+			}
+#endif
+
+			STACK_FREE(buffer);
+		}
+	} else {
+
+		for (i = 0; i < m; i++) {
+			j = i + 1;
+
+			bb = b + i * ldb;
+			if (transb & 1) {
+				bb = b + i;
+			}
+			cc = c + i * ldc;
+
+#if 0
+			if (beta != ONE)
+				SCAL_K(l, 0, 0, beta, cc, 1, NULL, 0, NULL, 0);
+
+			if (alpha == ZERO)
+				continue;
+#endif
+			IDEBUG_START;
+
+			buffer_size = j + k + 128 / sizeof(FLOAT);
+#ifdef WINDOWS_ABI
+			buffer_size += 160 / sizeof(FLOAT);
+#endif
+			// for alignment
+			buffer_size = (buffer_size + 3) & ~3;
+			STACK_ALLOC(buffer_size, IFLOAT, buffer);
+
+#ifdef SMP
+
+			if (1L * j * k < 2304L * GEMM_MULTITHREAD_THRESHOLD)
+				nthreads = 1;
+			else
+				nthreads = num_cpu_avail(2);
+
+			if (nthreads == 1) {
+#endif
+
+				if (!(transa & 1))
+				(gemv[(int)transa]) (j, k, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+				else
+				(gemv[(int)transa]) (k, j, alpha, a, lda, bb,
+						     incb, beta, cc, 1);
+
+#ifdef SMP
+			} else {
+				if (!(transa & 1))
+				(gemv_thread[(int)transa]) (j, k, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+				else
+				(gemv_thread[(int)transa]) (k, j, alpha, a, lda,
+							    bb, incb, beta, cc, 1,
+							    nthreads);
+			}
+#endif
+
+			STACK_FREE(buffer);
+		}
+	}
+
+	IDEBUG_END;
+
+	return;
+}
--- a/interface/zaxpby.c
+++ b/interface/zaxpby.c
@ -39,12 +39,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 #ifndef CBLAS

-void NAME(blasint *N, FLOAT *ALPHA, FLOAT *x, blasint *INCX, FLOAT *BETA, FLOAT *y, blasint *INCY)
+void NAME(blasint *N, void *VALPHA, FLOAT *x, blasint *INCX, void *VBETA, FLOAT *y, blasint *INCY)
 {

  blasint n    = *N;
  blasint incx = *INCX;
  blasint incy = *INCY;
+  FLOAT* ALPHA = (FLOAT*) VALPHA;
+  FLOAT* BETA = (FLOAT*) VBETA;

 #else

--- a/interface/zimatcopy.c
+++ b/interface/zimatcopy.c
@ -183,7 +183,10 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows,
    }
 #endif

-	msize = (size_t)(*rows) * (*cols) * sizeof(FLOAT) * 2;
+		if ( *rows >  *cols )
+                msize = (size_t)(*rows) * (*ldb)  * sizeof(FLOAT) * 2;
+        else
+                msize = (size_t)(*cols) * (*ldb)  * sizeof(FLOAT) * 2;

 	b = malloc(msize);
 	if ( b == NULL )
--- a/kernel/arm/zscal.c
+++ b/kernel/arm/zscal.c
@ -60,6 +60,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
 			else
 			{
 				temp = - da_i * x[ip+1] ;
+				if (isnan(x[ip]) || isinf(x[ip])) temp = NAN;
 				x[ip+1] = da_i * x[ip]  ;
 			}
 		}
--- a/kernel/arm64/KERNEL.A64FX
+++ b/kernel/arm64/KERNEL.A64FX
@ -1,206 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-STRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-STRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-STRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-DTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-DTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-DTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-DTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-TRSMCOPYLN_M    = trsm_lncopy_sve.c
-TRSMCOPYLT_M    = trsm_ltcopy_sve.c
-TRSMCOPYUN_M    = trsm_uncopy_sve.c
-TRSMCOPYUT_M    = trsm_utcopy_sve.c
-
-CTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-CTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-CTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-CTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMKERNEL_LN	= trsm_kernel_LN_sve.c
-ZTRSMKERNEL_LT	= trsm_kernel_LT_sve.c
-ZTRSMKERNEL_RN	= trsm_kernel_RN_sve.c
-ZTRSMKERNEL_RT	= trsm_kernel_RT_sve.c
-
-ZTRSMCOPYLN_M    = ztrsm_lncopy_sve.c
-ZTRSMCOPYLT_M    = ztrsm_ltcopy_sve.c
-ZTRSMCOPYUN_M    = ztrsm_uncopy_sve.c
-ZTRSMCOPYUT_M    = ztrsm_utcopy_sve.c
-
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-SGEMMKERNEL    =  sgemm_kernel_sve_v2x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_sve_v1x$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPY    =  gemm_ncopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMITCOPY    =  gemm_tcopy_sve_v1x$(SGEMM_UNROLL_N).c
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-STRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-STRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-STRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-STRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-SSYMMUCOPY_M    =  symm_ucopy_sve.c
-SSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-DGEMMKERNEL    =  dgemm_kernel_sve_v2x$(DGEMM_UNROLL_N).S
-DTRMMKERNEL    =  dtrmm_kernel_sve_v1x$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPY    =  gemm_ncopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMITCOPY    =  gemm_tcopy_sve_v1x$(DGEMM_UNROLL_N).c
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMUNCOPY_M  =  trmm_uncopy_sve_v1.c
-DTRMMLNCOPY_M  =  trmm_lncopy_sve_v1.c
-DTRMMUTCOPY_M  =  trmm_utcopy_sve_v1.c
-DTRMMLTCOPY_M  =  trmm_ltcopy_sve_v1.c
-
-DSYMMUCOPY_M    =  symm_ucopy_sve.c
-DSYMMLCOPY_M    =  symm_lcopy_sve.c
-
-CGEMMKERNEL    =  cgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-CTRMMKERNEL    =  ctrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-CGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-CTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-CTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-CTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-CHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-CHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-CSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-CSYMMLCOPY_M    =  zsymm_lcopy_sve.c
-
-ZGEMMKERNEL    =  zgemm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-ZTRMMKERNEL    =  ztrmm_kernel_sve_v1x$(ZGEMM_UNROLL_N).S
-
-ZGEMMINCOPY    =  gemm_ncopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMITCOPY    =  gemm_tcopy_complex_sve_v1x$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMUNCOPY_M  =  ztrmm_uncopy_sve_v1.c
-ZTRMMLNCOPY_M  =  ztrmm_lncopy_sve_v1.c
-ZTRMMUTCOPY_M  =  ztrmm_utcopy_sve_v1.c
-ZTRMMLTCOPY_M  =  ztrmm_ltcopy_sve_v1.c
-
-ZHEMMLTCOPY_M    =  zhemm_ltcopy_sve.c
-ZHEMMUTCOPY_M    =  zhemm_utcopy_sve.c
-
-ZSYMMUCOPY_M    =  zsymm_ucopy_sve.c
-ZSYMMLCOPY_M    =  zsymm_lcopy_sve.c
+include $(KERNELDIR)/KERNEL.ARMV8SVE
--- a/kernel/arm64/KERNEL.CORTEXA55
+++ b/kernel/arm64/KERNEL.CORTEXA55
@ -1,196 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = axpy.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-
-SASUMKERNEL    = asum.S
-DASUMKERNEL    = asum.S
-CASUMKERNEL    = casum.S
-ZASUMKERNEL    = zasum.S
-
-SCOPYKERNEL    = copy.S
-DCOPYKERNEL    = copy.S
-CCOPYKERNEL    = copy.S
-ZCOPYKERNEL    = copy.S
-
-SSWAPKERNEL    = swap.S
-DSWAPKERNEL    = swap.S
-CSWAPKERNEL    = swap.S
-ZSWAPKERNEL    = swap.S
-
-ISAMAXKERNEL   = iamax.S
-IDAMAXKERNEL   = iamax.S
-ICAMAXKERNEL   = izamax.S
-IZAMAXKERNEL   = izamax.S
-
-SNRM2KERNEL    = nrm2.S
-DNRM2KERNEL    = nrm2.S
-CNRM2KERNEL    = znrm2.S
-ZNRM2KERNEL    = znrm2.S
-
-ifneq ($(C_COMPILER), PGI)
-SDOTKERNEL   = ../generic/dot.c
-else
-SDOTKERNEL   = dot.S
-endif
-DDOTKERNEL   = dot.S
-ifneq ($(C_COMPILER), PGI)
-CDOTKERNEL   = zdot.S
-ZDOTKERNEL   = zdot.S
-else
-CDOTKERNEL = ../arm/zdot.c
-ZDOTKERNEL = ../arm/zdot.c
-endif
-DSDOTKERNEL  = dot.S
-
-DGEMM_BETA     = dgemm_beta.S
-SGEMM_BETA     = sgemm_beta.S
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 8x8)
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N)_cortexa53.S
-else
-SGEMMKERNEL    =  sgemm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-endif
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-ifeq ($(SGEMM_UNROLL_M), 16)
-SGEMMITCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-endif
-ifeq ($(SGEMM_UNROLL_M), 4)
-SGEMMINCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_M).S
-else
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-endif
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-SGEMMOTCOPY    =  sgemm_tcopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPY    =  sgemm_ncopy_$(SGEMM_UNROLL_N).S
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DGEMMKERNEL    =  dgemm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N)_cortexa53.c
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CGEMMKERNEL    =  cgemm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N)_cortexa53.c
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZGEMMKERNEL    =  zgemm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N)_cortexa53.c
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+include $(KERNELDIR)/KERNEL.CORTEXA53
--- a/kernel/arm64/KERNEL.THUNDERX3T110
+++ b/kernel/arm64/KERNEL.THUNDERX3T110
@ -1,184 +1 @@
-SAMINKERNEL  = ../arm/amin.c
-DAMINKERNEL  = ../arm/amin.c
-CAMINKERNEL  = ../arm/zamin.c
-ZAMINKERNEL  = ../arm/zamin.c
-
-SMAXKERNEL   = ../arm/max.c
-DMAXKERNEL   = ../arm/max.c
-
-SMINKERNEL   = ../arm/min.c
-DMINKERNEL   = ../arm/min.c
-
-ISAMINKERNEL = ../arm/iamin.c
-IDAMINKERNEL = ../arm/iamin.c
-ICAMINKERNEL = ../arm/izamin.c
-IZAMINKERNEL = ../arm/izamin.c
-
-ISMAXKERNEL  = ../arm/imax.c
-IDMAXKERNEL  = ../arm/imax.c
-
-ISMINKERNEL  = ../arm/imin.c
-IDMINKERNEL  = ../arm/imin.c
-
-STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
-STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
-STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
-STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
-
-DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
-ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
-ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
-ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
-
-SAMAXKERNEL  = amax.S
-DAMAXKERNEL  = amax.S
-CAMAXKERNEL  = zamax.S
-ZAMAXKERNEL  = zamax.S
-
-SAXPYKERNEL  = axpy.S
-DAXPYKERNEL  = daxpy_thunderx2t99.S
-CAXPYKERNEL  = zaxpy.S
-ZAXPYKERNEL  = zaxpy.S
-
-SROTKERNEL   = rot.S
-DROTKERNEL   = rot.S
-CROTKERNEL   = zrot.S
-ZROTKERNEL   = zrot.S
-
-SSCALKERNEL  = scal.S
-DSCALKERNEL  = scal.S
-CSCALKERNEL  = zscal.S
-ZSCALKERNEL  = zscal.S
-
-SGEMVNKERNEL = gemv_n.S
-DGEMVNKERNEL = gemv_n.S
-CGEMVNKERNEL = zgemv_n.S
-ZGEMVNKERNEL = zgemv_n.S
-
-SGEMVTKERNEL = gemv_t.S
-DGEMVTKERNEL = gemv_t.S
-CGEMVTKERNEL = zgemv_t.S
-ZGEMVTKERNEL = zgemv_t.S
-
-STRMMKERNEL    =  strmm_kernel_$(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N).S
-ifneq ($(SGEMM_UNROLL_M), $(SGEMM_UNROLL_N))
-SGEMMINCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_M).c
-SGEMMITCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_M).c
-SGEMMINCOPYOBJ =  sgemm_incopy$(TSUFFIX).$(SUFFIX)
-SGEMMITCOPYOBJ =  sgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-SGEMMONCOPY    =  ../generic/gemm_ncopy_$(SGEMM_UNROLL_N).c
-SGEMMOTCOPY    =  ../generic/gemm_tcopy_$(SGEMM_UNROLL_N).c
-SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
-SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-DTRMMKERNEL    =  dtrmm_kernel_$(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N).S
-
-ifneq ($(DGEMM_UNROLL_M), $(DGEMM_UNROLL_N))
-
-ifeq ($(DGEMM_UNROLL_M), 8)
-DGEMMINCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_M).S
-DGEMMITCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_M).S
-else
-DGEMMINCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_M).c
-DGEMMITCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_M).c
-endif
-
-DGEMMINCOPYOBJ =  dgemm_incopy$(TSUFFIX).$(SUFFIX)
-DGEMMITCOPYOBJ =  dgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-
-ifeq ($(DGEMM_UNROLL_N), 4)
-DGEMMONCOPY    =  dgemm_ncopy_$(DGEMM_UNROLL_N).S
-DGEMMOTCOPY    =  dgemm_tcopy_$(DGEMM_UNROLL_N).S
-else
-DGEMMONCOPY    =  ../generic/gemm_ncopy_$(DGEMM_UNROLL_N).c
-DGEMMOTCOPY    =  ../generic/gemm_tcopy_$(DGEMM_UNROLL_N).c
-endif
-
-DGEMMONCOPYOBJ =  dgemm_oncopy$(TSUFFIX).$(SUFFIX)
-DGEMMOTCOPYOBJ =  dgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-CTRMMKERNEL    =  ctrmm_kernel_$(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N).S
-ifneq ($(CGEMM_UNROLL_M), $(CGEMM_UNROLL_N))
-CGEMMINCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_M).c
-CGEMMITCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_M).c
-CGEMMINCOPYOBJ =  cgemm_incopy$(TSUFFIX).$(SUFFIX)
-CGEMMITCOPYOBJ =  cgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-CGEMMONCOPY    =  ../generic/zgemm_ncopy_$(CGEMM_UNROLL_N).c
-CGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(CGEMM_UNROLL_N).c
-CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
-CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-ZTRMMKERNEL    =  ztrmm_kernel_$(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N).S
-ifneq ($(ZGEMM_UNROLL_M), $(ZGEMM_UNROLL_N))
-ZGEMMINCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_M).c
-ZGEMMITCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_M).c
-ZGEMMINCOPYOBJ =  zgemm_incopy$(TSUFFIX).$(SUFFIX)
-ZGEMMITCOPYOBJ =  zgemm_itcopy$(TSUFFIX).$(SUFFIX)
-endif
-ZGEMMONCOPY    =  ../generic/zgemm_ncopy_$(ZGEMM_UNROLL_N).c
-ZGEMMOTCOPY    =  ../generic/zgemm_tcopy_$(ZGEMM_UNROLL_N).c
-ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
-ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
-
-SASUMKERNEL    = sasum_thunderx2t99.c
-DASUMKERNEL    = dasum_thunderx2t99.c
-CASUMKERNEL    = casum_thunderx2t99.c
-ZASUMKERNEL    = zasum_thunderx2t99.c
-
-SCOPYKERNEL    = copy_thunderx2t99.c
-DCOPYKERNEL    = copy_thunderx2t99.c
-CCOPYKERNEL    = copy_thunderx2t99.c
-ZCOPYKERNEL    = copy_thunderx2t99.c
-
-SSWAPKERNEL    = swap_thunderx2t99.S
-DSWAPKERNEL    = swap_thunderx2t99.S
-CSWAPKERNEL    = swap_thunderx2t99.S
-ZSWAPKERNEL    = swap_thunderx2t99.S
-
-ISAMAXKERNEL   = iamax_thunderx2t99.c
-IDAMAXKERNEL   = iamax_thunderx2t99.c
-ICAMAXKERNEL   = izamax_thunderx2t99.c
-IZAMAXKERNEL   = izamax_thunderx2t99.c
-
-SNRM2KERNEL    = scnrm2_thunderx2t99.c
-CNRM2KERNEL    = scnrm2_thunderx2t99.c
-#DNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-#ZNRM2KERNEL    = dznrm2_thunderx2t99_fast.c
-DNRM2KERNEL    = dznrm2_thunderx2t99.c
-ZNRM2KERNEL    = dznrm2_thunderx2t99.c
-
-
-DDOTKERNEL     = dot.c
-SDOTKERNEL     = dot.c
-CDOTKERNEL     = zdot_thunderx2t99.c
-ZDOTKERNEL     = zdot_thunderx2t99.c
-DSDOTKERNEL    = dot.S
-
-ifeq ($(DGEMM_UNROLL_M)x$(DGEMM_UNROLL_N), 8x4)
-DGEMMKERNEL    = dgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(SGEMM_UNROLL_M)x$(SGEMM_UNROLL_N), 16x4)
-SGEMMKERNEL    =  sgemm_kernel_16x4_thunderx2t99.S
-endif
-
-ifeq ($(CGEMM_UNROLL_M)x$(CGEMM_UNROLL_N), 8x4)
-CGEMMKERNEL    =  cgemm_kernel_8x4_thunderx2t99.S
-endif
-
-ifeq ($(ZGEMM_UNROLL_M)x$(ZGEMM_UNROLL_N), 4x4)
-ZGEMMKERNEL    =  zgemm_kernel_4x4_thunderx2t99.S
-endif
+include $(KERNELDIR)/KERNEL.THUNDERX2T99
--- a/kernel/arm64/dot_kernel_sve.c
+++ b/kernel/arm64/dot_kernel_sve.c
@ -1,4 +1,5 @@
 /***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
 Copyright (c) 2022, Arm Ltd
 All rights reserved.
 Redistribution and use in source and binary forms, with or without
@ -30,37 +31,112 @@ THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #include <arm_sve.h>

+/*
+ * String fragments spliced into the inline-asm template below:
+ *   DTYPE - element suffix of z/p registers and prefix of scalar FP registers
+ *   WIDTH - size suffix of the cnt<W> and ld1<W> mnemonics
+ *   SHIFT - log2 of the element size in bytes, used to scale the index
+ */
 #ifdef DOUBLE
-#define SVE_TYPE svfloat64_t
-#define SVE_ZERO svdup_f64(0.0)
-#define SVE_WHILELT svwhilelt_b64
-#define SVE_ALL svptrue_b64()
-#define SVE_WIDTH svcntd()
+#define DTYPE "d"
+#define WIDTH "d"
+#define SHIFT "3"
 #else
-#define SVE_TYPE svfloat32_t
-#define SVE_ZERO svdup_f32(0.0)
-#define SVE_WHILELT svwhilelt_b32
-#define SVE_ALL svptrue_b32()
-#define SVE_WIDTH svcntw()
+#define DTYPE "s"
+#define WIDTH "w"
+#define SHIFT "2"
 #endif

-static FLOAT dot_kernel_sve(BLASLONG n, FLOAT *x, FLOAT *y) {
-        SVE_TYPE acc_a = SVE_ZERO;
-        SVE_TYPE acc_b = SVE_ZERO;
+#define COUNT \
+"        cnt"WIDTH"    x9                                   \n"
+#define SETUP_TRUE \
+"        ptrue   p0."DTYPE"                              \n"
+#define OFFSET_INPUTS                                     \
+"        add     x12, %[X_], x9, lsl #"SHIFT"               \n" \
+"        add     x13, %[Y_], x9, lsl #"SHIFT"               \n"
+#define TAIL_WHILE                                        \
+"        whilelo p1."DTYPE", x8, %[N_]                      \n"
+#define UPDATE(pg, x,y,out)                               \
+"        ld1"WIDTH"    { z2."DTYPE" }, "pg"/z, ["x", x8, lsl #"SHIFT"]  \n" \
+"        ld1"WIDTH"    { z3."DTYPE" }, "pg"/z, ["y", x8, lsl #"SHIFT"]  \n" \
+"        fmla    "out"."DTYPE", "pg"/m, z2."DTYPE", z3."DTYPE"      \n"
+#define SUM_VECTOR(v) \
+"        faddv   "DTYPE""v", p0, z"v"."DTYPE"                     \n"
+#define RET \
+"        fadd    %"DTYPE"[RET_], "DTYPE"1, "DTYPE"0                     \n"

-        BLASLONG sve_width = SVE_WIDTH;
+/*
+ * Register use inside DOT_KERNEL:
+ *   x8        running element index
+ *   x9        number of elements per vector
+ *   x10       -2*x9 while in the 2x loop, afterwards %[N_] rounded down to x9
+ *   x11       %[N_] rounded down to a multiple of 2*x9
+ *   x12, x13  x and y advanced by one vector (second half of the 2x loop)
+ *   z0, z1    accumulators;  z2, z3  loaded operands
+ *   p0        all-true predicate;  p1  tail predicate
+ * The neg/and rounding assumes x9 is a power of two.
+ * n is always named through %[N_]: its "r" constraint lets the compiler keep
+ * it in any general register, so it must not be assumed to live in x0.
+ */
+#define DOT_KERNEL                                        \
+        COUNT                                             \
+"        mov     z1.d, #0                             \n" \
+"        mov     z0.d, #0                             \n" \
+"        mov     x8, #0                               \n" \
+"        movi    d1, #0x0                             \n" \
+        SETUP_TRUE                                        \
+"        neg     x10, x9, lsl #1                      \n" \
+"        ands    x11, x10, %[N_]                      \n" \
+"        b.eq    2f // skip_2x                        \n" \
+        OFFSET_INPUTS                                     \
+"1: // vector_2x                                      \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z1") \
+        UPDATE("p0", "x12", "x13", "z0") \
+"        sub     x8, x8, x10                          \n" \
+"        cmp     x8, x11                              \n" \
+"        b.lo    1b // vector_2x                      \n" \
+        SUM_VECTOR("1") \
+"2: // skip_2x                                        \n" \
+"        neg     x10, x9                              \n" \
+"        and     x10, x10, %[N_]                      \n" \
+"        cmp     x8, x10                              \n" \
+"        b.hs    4f // tail                           \n" \
+"3: // vector_1x                                      \n" \
+        UPDATE("p0", "%[X_]", "%[Y_]", "z0")              \
+"        add     x8, x8, x9                           \n" \
+"        cmp     x8, x10                              \n" \
+"        b.lo    3b // vector_1x                      \n" \
+"4: // tail                                           \n" \
+"        cmp     x10, %[N_]                           \n" \
+"        b.eq    5f // end                            \n" \
+        TAIL_WHILE                                        \
+        UPDATE("p1", "%[X_]", "%[Y_]", "z0")              \
+"5: // end                                            \n" \
+        SUM_VECTOR("0") \
+        RET

-        for (BLASLONG i = 0; i < n; i += sve_width * 2) {
-                svbool_t pg_a = SVE_WHILELT((uint64_t)i, (uint64_t)n);
-                svbool_t pg_b = SVE_WHILELT((uint64_t)(i + sve_width), (uint64_t)n);
+/*
+ * Dot product of two contiguous vectors: returns sum(x[i] * y[i], 0 <= i < n).
+ * Two vectors per iteration are processed while possible, then single
+ * vectors, then one predicated step for the remainder.
+ */
+static
+FLOAT
+dot_kernel_sve(BLASLONG n, FLOAT* x, FLOAT* y)
+{
+  FLOAT ret;

-                SVE_TYPE x_vec_a = svld1(pg_a, &x[i]);
-                SVE_TYPE y_vec_a = svld1(pg_a, &y[i]);
-                SVE_TYPE x_vec_b = svld1(pg_b, &x[i + sve_width]);
-                SVE_TYPE y_vec_b = svld1(pg_b, &y[i + sve_width]);
+  asm(DOT_KERNEL
+      :
+        [RET_] "=&w" (ret)
+      :
+        [N_] "r" (n),
+        [X_] "r" (x),
+        [Y_] "r" (y)
+      : /* every register the template writes, plus flags and memory reads */
+        "cc", "memory",
+        "x8", "x9", "x10", "x11", "x12", "x13",
+        "z0", "z1", "z2", "z3",
+        "p0", "p1");

-                acc_a = svmla_m(pg_a, acc_a, x_vec_a, y_vec_a);
-                acc_b = svmla_m(pg_b, acc_b, x_vec_b, y_vec_b);
-        }
-
-        return svaddv(SVE_ALL, acc_a) + svaddv(SVE_ALL, acc_b);
+  return ret;
 }
--- a/kernel/arm64/zscal.S
+++ b/kernel/arm64/zscal.S
@ -223,7 +223,7 @@ zscal_begin:
 	fcmp	DA_I, #0.0
 	beq	.Lzscal_kernel_RI_zero

-	b	.Lzscal_kernel_R_zero
+//	b	.Lzscal_kernel_R_zero

 .Lzscal_kernel_R_non_zero:

--- a/kernel/csky/KERNEL
+++ b/kernel/csky/KERNEL
@ -0,0 +1,149 @@
+SAMAXKERNEL  = ../arm/amax.c
+DAMAXKERNEL  = ../arm/amax.c
+CAMAXKERNEL  = ../arm/zamax.c
+ZAMAXKERNEL  = ../arm/zamax.c
+
+SAMINKERNEL  = ../arm/amin.c
+DAMINKERNEL  = ../arm/amin.c
+CAMINKERNEL  = ../arm/zamin.c
+ZAMINKERNEL  = ../arm/zamin.c
+
+SMAXKERNEL   = ../arm/max.c
+DMAXKERNEL   = ../arm/max.c
+
+SMINKERNEL   = ../arm/min.c
+DMINKERNEL   = ../arm/min.c
+
+ISAMAXKERNEL = ../arm/iamax.c
+IDAMAXKERNEL = ../arm/iamax.c
+ICAMAXKERNEL = ../arm/izamax.c
+IZAMAXKERNEL = ../arm/izamax.c
+
+ISAMINKERNEL = ../arm/iamin.c
+IDAMINKERNEL = ../arm/iamin.c
+ICAMINKERNEL = ../arm/izamin.c
+IZAMINKERNEL = ../arm/izamin.c
+
+ISMAXKERNEL  = ../arm/imax.c
+IDMAXKERNEL  = ../arm/imax.c
+
+ISMINKERNEL  = ../arm/imin.c
+IDMINKERNEL  = ../arm/imin.c
+
+SASUMKERNEL  = ../arm/asum.c
+DASUMKERNEL  = ../arm/asum.c
+CASUMKERNEL  = ../arm/zasum.c
+ZASUMKERNEL  = ../arm/zasum.c
+
+SSUMKERNEL  = ../arm/sum.c
+DSUMKERNEL  = ../arm/sum.c
+CSUMKERNEL  = ../arm/zsum.c
+ZSUMKERNEL  = ../arm/zsum.c
+
+SAXPYKERNEL  = ../arm/axpy.c
+DAXPYKERNEL  = ../arm/axpy.c
+CAXPYKERNEL  = ../arm/zaxpy.c
+ZAXPYKERNEL  = ../arm/zaxpy.c
+
+SCOPYKERNEL  = ../arm/copy.c
+DCOPYKERNEL  = ../arm/copy.c
+CCOPYKERNEL  = ../arm/zcopy.c
+ZCOPYKERNEL  = ../arm/zcopy.c
+
+SDOTKERNEL   = ../arm/dot.c
+DDOTKERNEL   = ../arm/dot.c
+CDOTKERNEL   = ../arm/zdot.c
+ZDOTKERNEL   = ../arm/zdot.c
+DSDOTKERNEL  = ../generic/dot.c
+
+SNRM2KERNEL  = ../arm/nrm2.c
+DNRM2KERNEL  = ../arm/nrm2.c
+CNRM2KERNEL  = ../arm/znrm2.c
+ZNRM2KERNEL  = ../arm/znrm2.c
+
+SROTKERNEL   = ../arm/rot.c
+DROTKERNEL   = ../arm/rot.c
+CROTKERNEL   = ../arm/zrot.c
+ZROTKERNEL   = ../arm/zrot.c
+
+SSCALKERNEL  = ../arm/scal.c
+DSCALKERNEL  = ../arm/scal.c
+CSCALKERNEL  = ../arm/zscal.c
+ZSCALKERNEL  = ../arm/zscal.c
+
+SSWAPKERNEL  = ../arm/swap.c
+DSWAPKERNEL  = ../arm/swap.c
+CSWAPKERNEL  = ../arm/zswap.c
+ZSWAPKERNEL  = ../arm/zswap.c
+
+SGEMVNKERNEL = ../arm/gemv_n.c
+DGEMVNKERNEL = ../arm/gemv_n.c
+CGEMVNKERNEL = ../arm/zgemv_n.c
+ZGEMVNKERNEL = ../arm/zgemv_n.c
+
+SGEMVTKERNEL = ../arm/gemv_t.c
+DGEMVTKERNEL = ../arm/gemv_t.c
+CGEMVTKERNEL = ../arm/zgemv_t.c
+ZGEMVTKERNEL = ../arm/zgemv_t.c
+
+STRMMKERNEL	= ../generic/trmmkernel_2x2.c
+DTRMMKERNEL	= ../generic/trmmkernel_2x2.c
+CTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+ZTRMMKERNEL	= ../generic/ztrmmkernel_2x2.c
+
+SGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+SGEMMONCOPY    =  ../generic/gemm_ncopy_2.c
+SGEMMOTCOPY    =  ../generic/gemm_tcopy_2.c
+SGEMMONCOPYOBJ =  sgemm_oncopy$(TSUFFIX).$(SUFFIX)
+SGEMMOTCOPYOBJ =  sgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DGEMMKERNEL    =  ../generic/gemmkernel_2x2.c
+DGEMMONCOPY    = ../generic/gemm_ncopy_2.c
+DGEMMOTCOPY    = ../generic/gemm_tcopy_2.c
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+CGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+CGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+CGEMMONCOPYOBJ =  cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ =  cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZGEMMKERNEL    = ../generic/zgemmkernel_2x2.c
+ZGEMMONCOPY    = ../generic/zgemm_ncopy_2.c
+ZGEMMOTCOPY    = ../generic/zgemm_tcopy_2.c
+ZGEMMONCOPYOBJ =  zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ =  zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+STRSMKERNEL_LN	=  ../generic/trsm_kernel_LN.c
+STRSMKERNEL_LT	=  ../generic/trsm_kernel_LT.c
+STRSMKERNEL_RN	=  ../generic/trsm_kernel_RN.c
+STRSMKERNEL_RT	=  ../generic/trsm_kernel_RT.c
+
+DTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+CTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+ZTRSMKERNEL_LN	= ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT	= ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN	= ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT	= ../generic/trsm_kernel_RT.c
+
+
+SCABS_KERNEL	= ../generic/cabs.c
+DCABS_KERNEL	= ../generic/cabs.c
+QCABS_KERNEL	= ../generic/cabs.c
+LSAME_KERNEL	= ../generic/lsame.c
+
+SGEMM_BETA = ../generic/gemm_beta.c
+DGEMM_BETA = ../generic/gemm_beta.c
+CGEMM_BETA = ../generic/zgemm_beta.c
+ZGEMM_BETA = ../generic/zgemm_beta.c
+
+
--- a/kernel/csky/Makefile
+++ b/kernel/csky/Makefile
@ -0,0 +1 @@
+clean ::
--- a/kernel/generic/trmmkernel_16x8.c
+++ b/kernel/generic/trmmkernel_16x8.c
--- a/kernel/generic/zimatcopy_cnc.c
+++ b/kernel/generic/zimatcopy_cnc.c
@ -40,7 +40,6 @@ int CNAME(BLASLONG rows, BLASLONG cols, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a,

 	if ( rows <= 0     )  return(0);
 	if ( cols <= 0     )  return(0);
-    if ( alpha_r == 1.0 && alpha_i == 0.0 ) return (0); 

 	aptr = a;
 	lda *= 2;
--- a/kernel/generic/zlaswp_ncopy_8.c
+++ b/kernel/generic/zlaswp_ncopy_8.c
--- a/kernel/loongarch64/KERNEL.LOONGSON2K1000
+++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000
@ -3,56 +3,126 @@ ifndef NO_LSX
 SDOTKERNEL  = dot_lsx.S
 DSDOTKERNEL = dot_lsx.S
 DDOTKERNEL  = dot_lsx.S
+CDOTKERNEL  = cdot_lsx.S
+ZDOTKERNEL  = cdot_lsx.S

-SSCALKERNEL  = sscal_lsx.S
-DSCALKERNEL  = dscal_lsx.S
+SSCALKERNEL  = scal_lsx.S
+DSCALKERNEL  = scal_lsx.S
+CSCALKERNEL  = cscal_lsx.S
+ZSCALKERNEL  = cscal_lsx.S

-SAMAXKERNEL =  samax_lsx.S
-DAMAXKERNEL =  damax_lsx.S
+SAMAXKERNEL =  amax_lsx.S
+DAMAXKERNEL =  amax_lsx.S
+CAMAXKERNEL =  camax_lsx.S
+ZAMAXKERNEL =  camax_lsx.S

-SAMINKERNEL =  samin_lsx.S
-DAMINKERNEL =  damin_lsx.S
+SAMINKERNEL =  amin_lsx.S
+DAMINKERNEL =  amin_lsx.S
+CAMINKERNEL =  camin_lsx.S
+ZAMINKERNEL =  camin_lsx.S

-SMAXKERNEL  =  smax_lsx.S
-DMAXKERNEL  =  dmax_lsx.S
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL  =  max_lsx.S

-SMINKERNEL  =  smin_lsx.S
-DMINKERNEL  =  dmin_lsx.S
+SMINKERNEL  =  min_lsx.S
+DMINKERNEL  =  min_lsx.S

-ISMAXKERNEL =  ismax_lsx.S
-IDMAXKERNEL =  idmax_lsx.S
+ISMAXKERNEL =  imax_lsx.S
+IDMAXKERNEL =  imax_lsx.S

-ISMINKERNEL =  ismin_lsx.S
-IDMINKERNEL =  idmin_lsx.S
+ISMINKERNEL =  imin_lsx.S
+IDMINKERNEL =  imin_lsx.S

-ISAMAXKERNEL = isamax_lsx.S
-IDAMAXKERNEL = idamax_lsx.S
+ISAMAXKERNEL = iamax_lsx.S
+IDAMAXKERNEL = iamax_lsx.S
+ICAMAXKERNEL = icamax_lsx.S
+IZAMAXKERNEL = icamax_lsx.S

-ISAMINKERNEL = isamin_lsx.S
-IDAMINKERNEL = idamin_lsx.S
+ISAMINKERNEL = iamin_lsx.S
+IDAMINKERNEL = iamin_lsx.S
+ICAMINKERNEL = icamin_lsx.S
+IZAMINKERNEL = icamin_lsx.S

-SCOPYKERNEL =  scopy_lsx.S
-DCOPYKERNEL =  dcopy_lsx.S
+SCOPYKERNEL =  copy_lsx.S
+DCOPYKERNEL =  copy_lsx.S
+CCOPYKERNEL =  ccopy_lsx.S
+ZCOPYKERNEL =  ccopy_lsx.S

-SSWAPKERNEL =  sswap_lsx.S
-DSWAPKERNEL =  dswap_lsx.S
+SSWAPKERNEL =  swap_lsx.S
+DSWAPKERNEL =  swap_lsx.S

-SAXPYKERNEL =  saxpy_lsx.S
-DAXPYKERNEL =  daxpy_lsx.S
+SAXPYKERNEL =  axpy_lsx.S
+DAXPYKERNEL =  axpy_lsx.S
+CAXPYKERNEL =  caxpy_lsx.S
+ZAXPYKERNEL =  caxpy_lsx.S

-SAXPBYKERNEL = saxpby_lsx.S
-DAXPBYKERNEL = daxpby_lsx.S
+SAXPBYKERNEL = axpby_lsx.S
+DAXPBYKERNEL = axpby_lsx.S
+CAXPBYKERNEL = caxpby_lsx.S
+ZAXPBYKERNEL = caxpby_lsx.S

-SSUMKERNEL  =  ssum_lsx.S
-DSUMKERNEL  =  dsum_lsx.S
+SSUMKERNEL  =  sum_lsx.S
+DSUMKERNEL  =  sum_lsx.S

-SASUMKERNEL =  sasum_lsx.S
-DASUMKERNEL =  dasum_lsx.S
+SASUMKERNEL =  asum_lsx.S
+DASUMKERNEL =  asum_lsx.S
+CASUMKERNEL =  casum_lsx.S
+ZASUMKERNEL =  casum_lsx.S

-SROTKERNEL  =  srot_lsx.S
-DROTKERNEL  =  drot_lsx.S
+SROTKERNEL  =  rot_lsx.S
+DROTKERNEL  =  rot_lsx.S
+CROTKERNEL  =  crot_lsx.S
+ZROTKERNEL  =  crot_lsx.S

 SNRM2KERNEL =  snrm2_lsx.S
 DNRM2KERNEL =  dnrm2_lsx.S
+CNRM2KERNEL =  cnrm2_lsx.S
+ZNRM2KERNEL =  znrm2_lsx.S

+CSWAPKERNEL = cswap_lsx.S
+ZSWAPKERNEL = cswap_lsx.S
+
+CSUMKERNEL = csum_lsx.S
+ZSUMKERNEL = csum_lsx.S
+
+DGEMMKERNEL    = dgemm_kernel_8x4.S
+DGEMMINCOPY    = dgemm_ncopy_8_lsx.S
+DGEMMITCOPY    = dgemm_tcopy_8_lsx.S
+DGEMMONCOPY    = dgemm_ncopy_4_lsx.S
+DGEMMOTCOPY    = dgemm_tcopy_4_lsx.S
+DGEMMINCOPYOBJ = dgemm_incopy$(TSUFFIX).$(SUFFIX)
+DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
+DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
+DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+DTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+DTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+DTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+DTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+CGEMMKERNEL  = cgemm_kernel_8x4_lsx.S
+CGEMMINCOPY  = cgemm_ncopy_8_lsx.S
+CGEMMITCOPY  = cgemm_tcopy_8_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_4_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_4_lsx.S
+CGEMMINCOPYOBJ = cgemm_incopy$(TSUFFIX).$(SUFFIX)
+CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZGEMMKERNEL  = zgemm_kernel_4x4_lsx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lsx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lsx.S
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
 endif
--- a/kernel/loongarch64/KERNEL.LOONGSON3R5
+++ b/kernel/loongarch64/KERNEL.LOONGSON3R5
@ -3,57 +3,87 @@ ifndef NO_LASX
 SDOTKERNEL  = dot_lasx.S
 DSDOTKERNEL = dot_lasx.S
 DDOTKERNEL  = dot_lasx.S
+CDOTKERNEL  = cdot_lasx.S
+ZDOTKERNEL  = cdot_lasx.S

-SSCALKERNEL  = sscal_lasx.S
-DSCALKERNEL  = dscal_lasx.S
+SSCALKERNEL  = scal_lasx.S
+DSCALKERNEL  = scal_lasx.S
+CSCALKERNEL  = cscal_lasx.S
+ZSCALKERNEL  = cscal_lasx.S

-SAMAXKERNEL =  samax_lasx.S
-DAMAXKERNEL =  damax_lasx.S
+SAMAXKERNEL =  amax_lasx.S
+DAMAXKERNEL =  amax_lasx.S
+CAMAXKERNEL =  camax_lasx.S
+ZAMAXKERNEL =  camax_lasx.S

-SAMINKERNEL =  samin_lasx.S
-DAMINKERNEL =  damin_lasx.S
+SAMINKERNEL =  amin_lasx.S
+DAMINKERNEL =  amin_lasx.S
+CAMINKERNEL =  camin_lasx.S
+ZAMINKERNEL =  camin_lasx.S

-SMAXKERNEL  =  smax_lasx.S
-DMAXKERNEL =   dmax_lasx.S
+SMAXKERNEL  =  max_lsx.S
+DMAXKERNEL =   max_lsx.S

-SMINKERNEL =   smin_lasx.S
-DMINKERNEL =   dmin_lasx.S
+SMINKERNEL =   min_lsx.S
+DMINKERNEL =   min_lsx.S

-ISMAXKERNEL =  ismax_lasx.S
-IDMAXKERNEL =  idmax_lasx.S
+ISMAXKERNEL =  imax_lasx.S
+IDMAXKERNEL =  imax_lasx.S

-ISMINKERNEL =  ismin_lasx.S
-IDMINKERNEL =  idmin_lasx.S
+ISMINKERNEL =  imin_lasx.S
+IDMINKERNEL =  imin_lasx.S

-ISAMAXKERNEL = isamax_lasx.S
-IDAMAXKERNEL = idamax_lasx.S
+ISAMAXKERNEL = iamax_lasx.S
+IDAMAXKERNEL = iamax_lasx.S
+ICAMAXKERNEL = icamax_lasx.S
+IZAMAXKERNEL = icamax_lasx.S

-ISAMINKERNEL = isamin_lasx.S
-IDAMINKERNEL = idamin_lasx.S
+ISAMINKERNEL = iamin_lasx.S
+IDAMINKERNEL = iamin_lasx.S
+ICAMINKERNEL = icamin_lasx.S
+IZAMINKERNEL = icamin_lasx.S

-SCOPYKERNEL =  scopy_lasx.S
-DCOPYKERNEL =  dcopy_lasx.S
+SCOPYKERNEL =  copy_lasx.S
+DCOPYKERNEL =  copy_lasx.S
+CCOPYKERNEL =  ccopy_lasx.S
+ZCOPYKERNEL =  ccopy_lasx.S

-SSWAPKERNEL =  sswap_lasx.S
-DSWAPKERNEL =  dswap_lasx.S
+SSWAPKERNEL =  swap_lasx.S
+DSWAPKERNEL =  swap_lasx.S

-SAXPYKERNEL =  saxpy_lasx.S
-DAXPYKERNEL =  daxpy_lasx.S
+SAXPYKERNEL =  axpy_lasx.S
+DAXPYKERNEL =  axpy_lasx.S
+CAXPYKERNEL =  caxpy_lasx.S
+ZAXPYKERNEL =  caxpy_lasx.S

-SAXPBYKERNEL = saxpby_lasx.S
-DAXPBYKERNEL = daxpby_lasx.S
+SAXPBYKERNEL = axpby_lasx.S
+DAXPBYKERNEL = axpby_lasx.S
+CAXPBYKERNEL = caxpby_lasx.S
+ZAXPBYKERNEL = caxpby_lasx.S

-SSUMKERNEL  =  ssum_lasx.S
-DSUMKERNEL  =  dsum_lasx.S
+SSUMKERNEL  =  sum_lasx.S
+DSUMKERNEL  =  sum_lasx.S

-SASUMKERNEL =  sasum_lasx.S
-DASUMKERNEL =  dasum_lasx.S
+SASUMKERNEL =  asum_lasx.S
+DASUMKERNEL =  asum_lasx.S
+CASUMKERNEL =  casum_lasx.S
+ZASUMKERNEL =  casum_lasx.S

-SROTKERNEL  =  srot_lasx.S
-DROTKERNEL  =  drot_lasx.S
+SROTKERNEL  =  rot_lasx.S
+DROTKERNEL  =  rot_lasx.S
+CROTKERNEL  =  crot_lasx.S
+ZROTKERNEL  =  crot_lasx.S

 SNRM2KERNEL =  snrm2_lasx.S
 DNRM2KERNEL =  dnrm2_lasx.S
+CNRM2KERNEL =  cnrm2_lasx.S
+ZNRM2KERNEL =  znrm2_lasx.S
+
+CSWAPKERNEL = cswap_lasx.S
+ZSWAPKERNEL = cswap_lasx.S
+
+CSUMKERNEL = csum_lasx.S
+ZSUMKERNEL = csum_lasx.S

 DGEMMKERNEL    = dgemm_kernel_16x4.S
 DGEMMINCOPY    = dgemm_ncopy_16.S
@ -81,13 +111,39 @@ SGEMMOTCOPYOBJ = sgemm_otcopy$(TSUFFIX).$(SUFFIX)
 SGEMVNKERNEL = sgemv_n_8_lasx.S
 SGEMVTKERNEL = sgemv_t_8_lasx.S

+CGEMMKERNEL  = cgemm_kernel_2x2_lsx.S
+CGEMMONCOPY  = cgemm_ncopy_2_lsx.S
+CGEMMOTCOPY  = cgemm_tcopy_2_lsx.S
+CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
+CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+CTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+CTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+CTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+CTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
+ZGEMMKERNEL  = zgemm_kernel_8x4_lasx.S
+ZGEMMINCOPY  = zgemm_ncopy_8_lasx.S
+ZGEMMITCOPY  = zgemm_tcopy_8_lasx.S
+ZGEMMONCOPY  = zgemm_ncopy_4_lasx.S
+ZGEMMOTCOPY  = zgemm_tcopy_4_lasx.S
+ZGEMMINCOPYOBJ = zgemm_incopy$(TSUFFIX).$(SUFFIX)
+ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
+ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
+ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
+
+ZTRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
+ZTRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
+ZTRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
+ZTRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+
 DTRSMKERNEL_LN  = dtrsm_kernel_LN_16x4_lasx.S
 DTRSMKERNEL_LT  = dtrsm_kernel_LT_16x4_lasx.S
 DTRSMKERNEL_RN  = dtrsm_kernel_RN_16x4_lasx.S
 DTRSMKERNEL_RT  = dtrsm_kernel_RT_16x4_lasx.S
-endif

 STRSMKERNEL_LN  = ../generic/trsm_kernel_LN.c
 STRSMKERNEL_LT  = ../generic/trsm_kernel_LT.c
 STRSMKERNEL_RN  = ../generic/trsm_kernel_RN.c
 STRSMKERNEL_RT  = ../generic/trsm_kernel_RT.c
+endif
--- a/kernel/loongarch64/amax_lasx.S
+++ b/kernel/loongarch64/amax_lasx.S
@ -0,0 +1,232 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (loaded through the pointer under F_INTERFACE) */
+#define X     $r5 /* address of the first element */
+#define INCX  $r6 /* stride in elements; converted to bytes below */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $xr0 /* running result; the scalar code uses its low element as $f0 */
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v VM0, VM0, VM0 /* early exits below must return 0, not stale $f0 */
+    bge $r0, N, .L999 /* N <= 0 */
+    bge $r0, INCX, .L999 /* INCX <= 0 */
+    li.d TEMP, SIZE /* assumes SIZE == 1 << BASE_SHIFT, as asum_lasx.S also does */
+    slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0
+#else
+    xvldrepl.w VM0, X, 0
+#endif
+    XVFSUB VM0, VM0, VM0 /* seed = x[0] - x[0], unchanged from the original */
+    bne INCX, TEMP, .L20 /* non-unit stride: gather path */
+
+    srai.d I, N, 4 /* number of 16-element groups */
+    bge $r0, I, .L11 /* fewer than 16 elements: scalar tail only */
+    .align 3
+
+.L10: /* unit-stride main loop: 16 elements per iteration */
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128 /* four 32-byte vectors consumed */
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM0, VM1
+    XVFMAXA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64 /* two 32-byte vectors consumed */
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0 /* horizontal reduction inside each 128-bit half */
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0 /* horizontal reduction inside each 128-bit half */
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMAXA   VM0, VM0, VM1 /* combine the two 128-bit halves */
+    .align 3
+
+.L11: /* remainder after the 16-wide loop (also the entry when N < 16) */
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* scalar tail: the remaining N % 16 elements, one per iteration */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1 /* $f0 carries the vector result forward */
+    bnez    I, .L12
+    .align 3
+
+.L13: /* unit-stride exit */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: /* INCX != 1: gather path, 8 elements per iteration */
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* load 8 strided elements through GPRs and insert them into vector lanes */
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 /* VX0 = elements 0..3 */
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 /* VX1 = elements 4..7 */
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmaxa.d VM1, VX0, VX1
+    xvfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0 /* VM1 = elements 0..7, filled in two batches */
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* horizontal reduction of VM0, same sequence as the unit-stride path */
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMAXA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMAXA   VM1, VX0, VX1
+    XVFMAXA   VM2, VX2, VX3
+    XVFMAXA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMAXA   VM0, VM0, VM1 /* combine the two 128-bit halves */
+    .align 3
+
+.L23: /* INCX != 1: remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* scalar tail: the remaining N % 8 elements, stepping by INCX bytes */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: /* common exit; the argument-check branches in the prologue also land here */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
--- a/kernel/loongarch64/amax_lsx.S
+++ b/kernel/loongarch64/amax_lsx.S
@ -0,0 +1,231 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (loaded through the pointer under F_INTERFACE) */
+#define X     $r5 /* address of the first element */
+#define INCX  $r6 /* stride in elements; converted to bytes below */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $vr0 /* running result; the scalar code uses its low element as $f0 */
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v VM0, VM0, VM0 /* early exits below must return 0, not stale $f0 */
+    bge $r0, N, .L999 /* N <= 0 */
+    bge $r0, INCX, .L999 /* INCX <= 0 */
+    li.d TEMP, SIZE /* assumes SIZE == 1 << BASE_SHIFT, as asum_lasx.S also does */
+    slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0
+#else
+    vldrepl.w VM0, X, 0
+#endif
+    VFSUB VM0, VM0, VM0 /* seed = x[0] - x[0], unchanged from the original */
+    bne INCX, TEMP, .L20 /* non-unit stride: gather path */
+
+    srai.d I, N, 3 /* number of 8-element groups */
+    bge $r0, I, .L11 /* fewer than 8 elements: scalar tail only */
+    .align 3
+
+.L10: /* unit-stride main loop: 8 elements per iteration */
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64 /* four 16-byte vectors consumed */
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM0, VM1
+    VFMAXA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32 /* two 16-byte vectors consumed */
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0 /* horizontal reduction: broadcast each lane, then combine */
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0 /* horizontal reduction: broadcast each lane, then combine */
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11: /* remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* scalar tail: the remaining N % 8 elements, one per iteration */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMAXA $f0, $f0, $f1 /* $f0 carries the vector result forward */
+    bnez    I, .L12
+    .align 3
+
+.L13: /* unit-stride exit */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: /* INCX != 1: gather path, 8 elements per iteration */
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* load 8 strided elements through GPRs and insert them into vector lanes */
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 /* VX0 = elements 0..1 */
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 /* VX1 = elements 2..3 */
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM1, VX0, VX1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 /* VX0 = elements 4..5 */
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 /* VX1 = elements 6..7 */
+    vinsgr2vr.d VX1, t4, 1
+    vfmaxa.d VM2, VX0, VX1
+    vfmaxa.d VM1, VM1, VM2
+    vfmaxa.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 /* VX0 = elements 0..3 */
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0 /* VX1 = elements 4..7 */
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmaxa.s VM1, VX0, VX1
+    vfmaxa.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* horizontal reduction of VM0, same sequence as the unit-stride path */
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMAXA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMAXA   VM1, VX0, VX1
+    VFMAXA   VM2, VX2, VX3
+    VFMAXA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: /* INCX != 1: remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* scalar tail: the remaining N % 8 elements, stepping by INCX bytes */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMAXA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: /* common exit; the argument-check branches in the prologue also land here */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
--- a/kernel/loongarch64/amin_lasx.S
+++ b/kernel/loongarch64/amin_lasx.S
@ -0,0 +1,231 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (loaded through the pointer under F_INTERFACE) */
+#define X     $r5 /* address of the first element */
+#define INCX  $r6 /* stride in elements; converted to bytes below */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $xr0 /* running result; the scalar code uses its low element as $f0 */
+#define VM1 $xr1
+#define VM2 $xr2
+#define VX0 $xr3
+#define VX1 $xr4
+#define VX2 $xr5
+#define VX3 $xr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    xvxor.v VM0, VM0, VM0 /* early exits below must return 0, not stale $f0 */
+    bge $r0, N, .L999 /* N <= 0 */
+    bge $r0, INCX, .L999 /* INCX <= 0 */
+    li.d TEMP, SIZE /* assumes SIZE == 1 << BASE_SHIFT, as asum_lasx.S also does */
+    slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */
+#ifdef DOUBLE
+    xvldrepl.d VM0, X, 0 /* seed every lane with x[0] */
+#else
+    xvldrepl.w VM0, X, 0 /* seed every lane with x[0] */
+#endif
+    bne INCX, TEMP, .L20 /* non-unit stride: gather path */
+
+    srai.d I, N, 4 /* number of 16-element groups */
+    bge $r0, I, .L11 /* fewer than 16 elements: scalar tail only */
+    .align 3
+
+.L10: /* unit-stride main loop: 16 elements per iteration */
+#ifdef DOUBLE
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    xvld VX2, X, 64
+    xvld VX3, X, 96
+    addi.d I, I, -1
+    addi.d X, X, 128 /* four 32-byte vectors consumed */
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM0, VM1
+    XVFMINA   VM0, VM0, VM2
+#else
+    xvld VX0, X, 0
+    xvld VX1, X, 32
+    addi.d I, I, -1
+    addi.d X, X, 64 /* two 32-byte vectors consumed */
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0 /* horizontal reduction inside each 128-bit half */
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0 /* horizontal reduction inside each 128-bit half */
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 0x1
+    XVFMINA   VM0, VM0, VM1 /* combine the two 128-bit halves */
+    .align 3
+
+.L11: /* remainder after the 16-wide loop (also the entry when N < 16) */
+    andi  I, N, 0x0f
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* scalar tail: the remaining N % 16 elements, one per iteration */
+    LD    $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1 /* $f0 carries the vector result forward */
+    bnez    I, .L12
+    .align 3
+
+.L13: /* unit-stride exit */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: /* INCX != 1: gather path, 8 elements per iteration */
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* load 8 strided elements through GPRs and insert them into vector lanes */
+#ifdef DOUBLE
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0
+    add.d X, X, INCX
+    ld.d t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 /* VX0 = elements 0..3 */
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0
+    add.d X, X, INCX
+    ld.d t2, X, 0
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 /* VX1 = elements 4..7 */
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmina.d VM1, VX0, VX1 /* min-by-magnitude (was xvfmaxa.d: wrong for amin) */
+    xvfmina.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 0 /* VM1 = elements 0..7, filled in two batches */
+    xvinsgr2vr.w VM1, t2, 1
+    xvinsgr2vr.w VM1, t3, 2
+    xvinsgr2vr.w VM1, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    xvinsgr2vr.w VM1, t1, 4
+    xvinsgr2vr.w VM1, t2, 5
+    xvinsgr2vr.w VM1, t3, 6
+    xvinsgr2vr.w VM1, t4, 7
+    xvfmina.s VM0, VM0, VM1 /* min-by-magnitude (was xvfmaxa.s: wrong for amin) */
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* horizontal reduction of VM0, same sequence as the unit-stride path */
+#ifdef DOUBLE
+    xvrepl128vei.d VX0, VM0, 0
+    xvrepl128vei.d VX1, VM0, 1
+    XVFMINA   VM0, VX0, VX1
+#else
+    xvrepl128vei.w VX0, VM0, 0
+    xvrepl128vei.w VX1, VM0, 1
+    xvrepl128vei.w VX2, VM0, 2
+    xvrepl128vei.w VX3, VM0, 3
+    XVFMINA   VM1, VX0, VX1
+    XVFMINA   VM2, VX2, VX3
+    XVFMINA   VM0, VM1, VM2
+#endif
+    xvpermi.q VM1, VM0, 1
+    XVFMINA   VM0, VM0, VM1 /* combine the two 128-bit halves */
+    .align 3
+
+.L23: /* INCX != 1: remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* scalar tail: the remaining N % 8 elements, stepping by INCX bytes */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: /* common exit; the argument-check branches in the prologue also land here */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
--- a/kernel/loongarch64/amin_lsx.S
+++ b/kernel/loongarch64/amin_lsx.S
@ -0,0 +1,230 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N     $r4 /* element count (loaded through the pointer under F_INTERFACE) */
+#define X     $r5 /* address of the first element */
+#define INCX  $r6 /* stride in elements; converted to bytes below */
+
+#define I     $r12 /* loop counter */
+#define TEMP  $r13 /* byte size of one element, used for the unit-stride test */
+
+#define VM0 $vr0 /* running result; the scalar code uses its low element as $f0 */
+#define VM1 $vr1
+#define VM2 $vr2
+#define VX0 $vr3
+#define VX1 $vr4
+#define VX2 $vr5
+#define VX3 $vr6
+
+#define t1 $r14
+#define t2 $r15
+#define t3 $r16
+#define t4 $r17
+
+    PROLOGUE
+
+#ifdef F_INTERFACE
+    LDINT   N,     0(N)
+    LDINT   INCX,  0(INCX)
+#endif
+
+    vxor.v VM0, VM0, VM0 /* early exits below must return 0, not stale $f0 */
+    bge $r0, N, .L999 /* N <= 0 */
+    bge $r0, INCX, .L999 /* INCX <= 0 */
+    li.d TEMP, SIZE /* assumes SIZE == 1 << BASE_SHIFT, as asum_lasx.S also does */
+    slli.d INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */
+#ifdef DOUBLE
+    vldrepl.d VM0, X, 0 /* seed every lane with x[0] */
+#else
+    vldrepl.w VM0, X, 0 /* seed every lane with x[0] */
+#endif
+    bne INCX, TEMP, .L20 /* non-unit stride: gather path */
+
+    srai.d I, N, 3 /* number of 8-element groups */
+    bge $r0, I, .L11 /* fewer than 8 elements: scalar tail only */
+    .align 3
+
+.L10: /* unit-stride main loop: 8 elements per iteration */
+#ifdef DOUBLE
+    vld VX0, X, 0
+    vld VX1, X, 16
+    vld VX2, X, 32
+    vld VX3, X, 48
+    addi.d I, I, -1
+    addi.d X, X, 64 /* four 16-byte vectors consumed */
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM0, VM1
+    VFMINA   VM0, VM0, VM2
+#else
+    vld VX0, X, 0
+    vld VX1, X, 16
+    addi.d I, I, -1
+    addi.d X, X, 32 /* two 16-byte vectors consumed */
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM0, VM0, VM1
+#endif
+    blt $r0, I, .L10
+
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0 /* horizontal reduction: broadcast each lane, then combine */
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0 /* horizontal reduction: broadcast each lane, then combine */
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L11: /* remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L13
+    .align 3
+
+.L12: /* scalar tail: the remaining N % 8 elements, one per iteration */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    addi.d  X, X, SIZE
+    FMINA $f0, $f0, $f1 /* $f0 carries the vector result forward */
+    bnez    I, .L12
+    .align 3
+
+.L13: /* unit-stride exit */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0,  $r1, 0x0
+    .align 3
+
+.L20: /* INCX != 1: gather path, 8 elements per iteration */
+    srai.d I, N, 3
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* load 8 strided elements through GPRs and insert them into vector lanes */
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 /* VX0 = elements 0..1 */
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 /* VX1 = elements 2..3 */
+    vinsgr2vr.d VX1, t4, 1
+    vfmina.d VM1, VX0, VX1 /* min-by-magnitude (was vfmaxa.d: wrong for amin) */
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0 /* VX0 = elements 4..5 */
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX1, t3, 0 /* VX1 = elements 6..7 */
+    vinsgr2vr.d VX1, t4, 1
+    vfmina.d VM2, VX0, VX1
+    vfmina.d VM1, VM1, VM2
+    vfmina.d VM0, VM0, VM1
+#else
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0 /* VX0 = elements 0..3 */
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0
+    add.d X, X, INCX
+    ld.w t2, X, 0
+    add.d X, X, INCX
+    ld.w t3, X, 0
+    add.d X, X, INCX
+    ld.w t4, X, 0
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0 /* VX1 = elements 4..7 */
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmina.s VM1, VX0, VX1 /* min-by-magnitude (was vfmaxa.s: wrong for amin) */
+    vfmina.s VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* horizontal reduction of VM0, same sequence as the unit-stride path */
+#ifdef DOUBLE
+    vreplvei.d VX0, VM0, 0
+    vreplvei.d VX1, VM0, 1
+    VFMINA   VM0, VX0, VX1
+#else
+    vreplvei.w VX0, VM0, 0
+    vreplvei.w VX1, VM0, 1
+    vreplvei.w VX2, VM0, 2
+    vreplvei.w VX3, VM0, 3
+    VFMINA   VM1, VX0, VX1
+    VFMINA   VM2, VX2, VX3
+    VFMINA   VM0, VM1, VM2
+#endif
+    .align 3
+
+.L23: /* INCX != 1: remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* scalar tail: the remaining N % 8 elements, stepping by INCX bytes */
+    LD   $f1, X, 0
+    addi.d  I, I, -1
+    add.d   X, X, INCX
+    FMINA $f0, $f0, $f1
+    bnez    I, .L24
+    .align 3
+
+.L999: /* common exit; the argument-check branches in the prologue also land here */
+    FABS $f0, $f0 /* return the magnitude */
+    jirl $r0, $r1, 0x0
+
+    EPILOGUE
--- a/kernel/loongarch64/asum_lasx.S
+++ b/kernel/loongarch64/asum_lasx.S
@ -0,0 +1,257 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4 /* element count */
+#define X      $r5 /* address of the first element */
+#define INCX   $r6 /* stride in elements; converted to bytes below */
+#define I      $r17 /* loop counter */
+#define TEMP   $r18 /* byte size of one element, used for the unit-stride test */
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define VT0    $xr23 /* compare mask for VX0 */
+#define VT1    $xr22 /* compare mask for VX1 */
+#define res1   $xr16 /* running sum; its low element ($f16) is the scalar accumulator */
+#define res2   $xr17
+#define res0   $xr18 /* all-zero constant */
+#define neg1   $xr19 /* -1.0 in every lane */
+
+    PROLOGUE
+    xvxor.v res1, res1, res1 /* zero the accumulators first so early exits return 0 */
+    xvxor.v res2, res2, res2
+    xvxor.v res0, res0, res0
+    bge $r0, N, .L999 /* N <= 0 */
+    bge $r0, INCX, .L999 /* INCX <= 0 */
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1
+    xvffint.d.l neg1, neg1 /* integer -1 -> floating-point -1.0 */
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1
+    xvffint.s.w neg1, neg1 /* integer -1 -> floating-point -1.0 */
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT /* stride: elements -> bytes */
+    srai.d I, N, 3 /* number of 8-element groups */
+    bne INCX, TEMP, .L20 /* non-unit stride: gather path */
+    bge $r0, I, .L13 /* fewer than 8 elements: scalar tail only */
+    .align 3
+
+.L11: /* unit-stride main loop: accumulate |x| for 8 elements per iteration */
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0 /* VX2 = -x */
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0 /* mask of lanes where x < 0 */
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0 /* take -x in the masked lanes: |x| */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvfmul.s VX2, neg1, VX0 /* VX2 = -x */
+    xvfcmp.clt.s VT0, VX0, res0 /* mask of lanes where x < 0 */
+    xvbitsel.v VX0, VX0, VX2, VT0 /* take -x in the masked lanes: |x| */
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12: /* horizontal sum of res1 into its low element */
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2 /* res2 is still zero on this path, so this copies res1 */
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4 /* lanes 4..7 come from the untouched copy */
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1 /* lane 7 (was VX2 again: lane 6 twice, lane 7 lost) */
+#endif
+    .align 3
+
+.L13: /* remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L14: /* scalar tail: add |x| for the remaining N % 8 elements */
+    LD    $f12, X, 0 * SIZE
+    FABS  $f12, $f12
+    ADD   $f16, $f12,  $f16 /* $f16 is the low element of res1 */
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: /* INCX != 1: gather path, 8 elements per iteration */
+    bge $r0, I, .L23
+    .align 3
+
+.L21: /* load 8 strided elements through GPRs, then accumulate |x| */
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 /* VX0 = elements 0..3 */
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 /* VX1 = elements 4..7 */
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0 /* VX2 = -x */
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0 /* mask of lanes where x < 0 */
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0 /* take -x in the masked lanes: |x| */
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0 /* VX0 = elements 0..7, filled in two batches */
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmul.s VX2, neg1, VX0 /* VX2 = -x */
+    xvfcmp.clt.s VT0, VX0, res0 /* mask of lanes where x < 0 */
+    xvbitsel.v VX0, VX0, VX2, VT0 /* take -x in the masked lanes: |x| */
+    xvfadd.s res1, VX0, res1
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: /* horizontal sum of res1 into its low element */
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    xvfadd.s res2, res1, res2 /* res2 is still zero on this path, so this copies res1 */
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res2, 4 /* lanes 4..7 come from the untouched copy */
+    xvpickve.w VX1, res2, 5
+    xvpickve.w VX2, res2, 6
+    xvpickve.w VX3, res2, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1 /* lane 7 (was VX2 again: lane 6 twice, lane 7 lost) */
+#endif
+    .align 3
+
+.L23: /* remainder after the 8-wide loop (also the entry when N < 8) */
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L24: /* scalar tail: add |x| for the remaining N % 8 elements, stepping by INCX bytes */
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16 /* $f16 is the low element of res1 */
+    addi.d I, I, -1
+    add.d  X, X, INCX
+    blt $r0, I, .L24
+    .align 3
+
+.L999: /* common exit: return the accumulated sum */
+    MOV    $f0, $f16
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/asum_lsx.S
+++ b/kernel/loongarch64/asum_lsx.S
@ -0,0 +1,258 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define VX0    $vr12
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define VT0    $vr23
+#define VT1    $vr22
+#define res1   $vr16
+#define res2   $vr17
+#define res0   $vr18
+#define neg1   $vr19
+
+    PROLOGUE
+    vxor.v res1, res1, res1 // res1 = 0: running vector accumulator
+    vxor.v res2, res2, res2 // res2 = 0: per-iteration partial sum
+    vxor.v res0, res0, res0 // res0 = 0: constant the inputs are compared against to find negative lanes
+    bge $r0, N, .L999 // N <= 0: return the zeroed accumulator
+    bge $r0, INCX, .L999 // INCX <= 0: likewise return 0
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1 // integer -1 -> -1.0 in every lane
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1 // integer -1 -> -1.0f in every lane
+#endif
+    li.d  TEMP, SIZE
+    slli.d  INCX, INCX, BASE_SHIFT // scale the stride by 2^BASE_SHIFT (presumably elements -> bytes; confirm in common.h)
+    srai.d I, N, 3 // I = N / 8: number of 8-element blocks
+    bne INCX, TEMP, .L20 // scaled stride != SIZE: take the strided path
+    bge $r0, I, .L13 // fewer than 8 elements: go straight to the scalar tail
+    .align 3
+
+.L11: // unit-stride main loop: 8 elements per iteration
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0 // VX2 = -x
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0 // mask of lanes where x < 0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // take -x in the masked lanes, x elsewhere => |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2 // fold elements 0..3 into the accumulator
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2 // fold elements 4..7 into the accumulator
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0 // VX2 = -x
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0 // mask of lanes where x < 0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // take -x in the masked lanes, x elsewhere => |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2 // fold all 8 elements into the accumulator
+#endif
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal reduction: after this, lane 0 of res1 holds the sum of all lanes
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1 // broadcast lane 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1 // broadcast lanes 1..3 before res1 is modified
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13:
+    andi I, N, 7 // I = N mod 8: elements left over after the 8-element blocks
+    bge $r0, I, .L999 // nothing left: return
+    .align 3
+
+.L14: // scalar tail of the unit-stride path, one element per iteration
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16 // accumulate into $f16 (same register number as res1 = $vr16; presumably its low lane)
+    addi.d I, I, -1
+    addi.d  X, X, SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // strided path entry
+    bge $r0, I, .L23 // fewer than 8 elements: go straight to the scalar tail
+    .align 3
+
+.L21: // strided main loop: gather 8 elements through GPRs, then the same |x| accumulation as the unit-stride loop
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads; the bits are inserted into vector lanes below
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t2, X, 0 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0 // VX2 = -x
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0 // mask of lanes where x < 0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // take -x in the masked lanes, x elsewhere => |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2 // fold elements 0..3 into the accumulator
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.d t4, X, 0 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2 // fold elements 4..7 into the accumulator
+#else
+    ld.w t1, X, 0 * SIZE // raw 32-bit loads; the bits are inserted into vector lanes below
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0 // VX2 = -x
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0 // mask of lanes where x < 0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // take -x in the masked lanes, x elsewhere => |x|
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1
+    vfadd.s res1, res1, res2 // fold all 8 elements into the accumulator
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction: after this, lane 0 of res1 holds the sum of all lanes
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1 // broadcast lane 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1 // broadcast lanes 1..3 before res1 is modified
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23:
+    andi I, N, 7 // I = N mod 8: elements left over after the 8-element blocks
+    bge $r0, I, .L999 // nothing left: return
+    .align 3
+
+.L24: // scalar tail of the strided path, one element per iteration
+    LD   $f12, X, 0 * SIZE
+    FABS $f12, $f12
+    ADD  $f16, $f12, $f16 // accumulate into $f16 (same register number as res1 = $vr16; presumably its low lane)
+    addi.d I, I, -1
+    add.d  X, X, INCX // INCX was already scaled in the prologue
+    blt $r0, I, .L24
+    .align 3
+
+.L999: // common exit, also taken directly for N <= 0 or INCX <= 0
+    MOV    $f0, $f16 // result is returned in $f0
+    jirl $r0, $r1, 0x0 // return to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/daxpby_lasx.S
+++ b/kernel/loongarch64/daxpby_lasx.S
@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/

+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define ALPHA  $f0
 #define X      $r5
@ -30,18 +57,29 @@
    PROLOGUE

    bge $r0, N, .L999
-    li.d TEMP, 1
    movgr2fr.d a1, $r0
    ffint.d.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG  t1, ALPHA // raw bits of alpha, beta and the zero constant moved to GPRs for broadcasting
+    MTG  t2, BETA
+    MTG  t3, a1 // a1 is all-zero bits (0.0 as double, and 0.0f in its low word), so it is valid for both precisions
+#ifdef DOUBLE
    xvreplgr2vr.d VXA, t1
-    movfr2gr.d t2, BETA
    xvreplgr2vr.d VXB, t2
-    movfr2gr.d t3, a1
    xvreplgr2vr.d VXZ, t3
+#else
+    xvreplgr2vr.w VXA, t1
+    xvreplgr2vr.w VXB, t2
+    xvreplgr2vr.w VXZ, t3
+#endif
+    // If incx == 0 || incy == 0, do one by one (the AND below is bitwise, so strides sharing no set bit also take this scalar path)
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N: the scalar loop at .L998 handles every element
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = scaled unit stride, compared against INCX / INCY below
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
@ -52,21 +90,22 @@

 .L11:
    bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
    b .L111 // ALPHA!=0 BETA!=0
    .align 3

 .L110:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
    b .L113 // ALPHA==0 BETA!=0
    .align 3

 .L111: // ALPHA!=0 BETA!=0
    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
    xvld VX3, Y, 4 * SIZE
@ -77,6 +116,13 @@
    addi.d  I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX2, VXB, VX0
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
@ -85,34 +131,46 @@

 .L112: // ALPHA!=0 BETA==0
    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    xvld VX1, X, 4 * SIZE
    xvfmul.d VX0, VX0, VXA
    xvfmul.d VX1, VX1, VXA
    xvst VX0, Y, 0 * SIZE
    xvst VX1, Y, 4 * SIZE
+#else
+    xvfmul.s VX0, VX0, VXA // y = alpha * x: one 256-bit vector covers all 8 floats
+    xvst VX0, Y, 0 * SIZE // the loop counter is decremented in the shared tail so the DOUBLE branch terminates too
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L112
    b .L997
    .align 3

 .L113: // ALPHA==0 BETA!=0
    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    xvld VX3, Y, 4 * SIZE
    xvfmul.d VX2, VX2, VXB
    xvfmul.d VX3, VX3, VXB
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
-    addi.d Y, Y, 8 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d  I, I, -1
+    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L113
    b .L997
    .align 3

 .L114: // ALPHA==0 BETA==0
    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
    xvst VXZ, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L114
@ -122,21 +180,22 @@
 .L12: // INCX==1 and INCY!=1
    bge $r0, I, .L997
    move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
    b .L121 // ALPHA!=0 BETA!=0
    .align 3

 .L120:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
    b .L123 // ALPHA==0 BETA!=0
    .align 3

 .L121: // ALPHA!=0 BETA!=0
    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -182,14 +241,59 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE // gather 8 strided y values through GPRs into VX2
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA // alpha * x
+    xvfmadd.s VX2, VX2, VXB, VX0 // y * beta + alpha * x
+    addi.d  I, I, -1 // decrement inside this branch only. NOTE(review): the DOUBLE branch is presumed to keep its own decrement in lines not shown in this hunk - confirm
+    xvstelm.w VX2, YY, 0, 0 // scatter the 8 results back through YY
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
    b .L997
    .align 3

 .L122: // ALPHA!=0 BETA==0
    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    xvld VX1, X, 4 * SIZE
    xvfmul.d VX0, VX0, VXA
    xvfmul.d VX1, VX1, VXA
@ -208,14 +312,33 @@
    xvstelm.d VX1, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX1, YY, 0, 3
+#else
+    xvfmul.s VX0, VX0, VXA // y = alpha * x for 8 floats; the loop counter is decremented in the shared tail so the DOUBLE branch terminates too
+    xvstelm.w VX0, YY, 0, 0 // scatter the 8 results through YY
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L122
    b .L997
    .align 3

 .L123: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -250,7 +373,6 @@
    xvstelm.d VX2, YY, 0, 3
    add.d YY, YY, INCY
    xvfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
    xvstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 1
@ -258,12 +380,56 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    xvstelm.w VX2, YY, 0, 0
    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
+    add.d YY, YY, INCY
+    addi.d  I, I, -1
    blt $r0, I, .L123
    b .L997
    .align 3

 .L124: // ALPHA==0 BETA==0
+#ifdef DOUBLE
    xvstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VXZ, YY, 0, 1
@ -279,6 +445,23 @@
    xvstelm.d VXZ, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    addi.d  I, I, -1
    blt $r0, I, .L124
@ -287,21 +470,22 @@

 .L21:// INCX!=1 and INCY==1
    bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L210
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
    b .L211 // ALPHA!=0 BETA!=0
    .align 3

 .L210:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
    b .L213 // ALPHA==0 BETA!=0
    .align 3

 .L211: // ALPHA!=0 BETA!=0
    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -334,12 +518,43 @@
    xvfmadd.d VX3, VX3, VXB, VX1
    addi.d  I, I, -1
    xvst VX3, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L211
    b .L997
    .align 3

 .L212: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -369,6 +584,35 @@
    xvfmul.d VX1, VX1, VXA
    addi.d  I, I, -1
    xvst VX1, Y, 4 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VXA, VX0
+    addi.d  I, I, -1
+    xvst VX0, Y, 0 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L212
    b .L997
@ -376,20 +620,27 @@

 .L213: // ALPHA==0 BETA!=0
    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    xvld VX3, Y, 4 * SIZE
    xvfmul.d VX2, VX2, VXB
    xvfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
+#else
+    xvfmul.s VX2, VX2, VXB
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
    blt $r0, I, .L213
    b .L997
    .align 3

 .L214: // ALPHA==0 BETA==0
    xvst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
    xvst VXZ, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L214
@ -399,20 +650,21 @@
 .L22:
    bge $r0, I, .L997
    move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L220
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
    b .L221 // ALPHA!=0 BETA!=0
    .align 3

 .L220:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
    b .L223 // ALPHA==0 BETA!=0
    .align 3

 .L221: // ALPHA!=0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -481,12 +733,81 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX0, VX0, VXA
+    xvfmadd.s VX2, VX2, VXB, VX0
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L221
    b .L997
    .align 3

 .L222: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -529,12 +850,56 @@
    xvstelm.d VX1, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX1, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    add.d X, X, INCX
+    xvfmul.s VX0, VX0, VXA
+    addi.d  I, I, -1
+    xvstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX0, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    b .L997
    .align 3

 .L223: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -577,12 +942,56 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmul.s VX2, VX2, VXB
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L223
    b .L997
    .align 3

 .L224: // ALPHA==0 BETA==0
+#ifdef DOUBLE
    xvstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    xvstelm.d VXZ, YY, 0, 1
@ -598,6 +1007,23 @@
    xvstelm.d VXZ, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VXZ, YY, 0, 3
+#else
+    xvstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VXZ, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    addi.d  I, I, -1
    blt $r0, I, .L224
@ -610,12 +1036,12 @@
    .align 3

 .L998:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
+    LD   $f12, X, 0 * SIZE
+    LD   $f13, Y, 0 * SIZE
    addi.d I, I, -1
-    fmul.d $f12, $f12, ALPHA
-    fmadd.d $f13, $f13, BETA, $f12
-    fst.d $f13, Y, 0 * SIZE
+    MUL  $f12, $f12, ALPHA
+    MADD $f13, $f13, BETA, $f12
+    ST   $f13, Y, 0 * SIZE
    add.d  X, X, INCX
    add.d  Y, Y, INCY
    blt $r0, I, .L998
--- a/kernel/loongarch64/daxpby_lsx.S
+++ b/kernel/loongarch64/daxpby_lsx.S
@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/

+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define ALPHA  $f0
 #define X      $r5
@ -30,18 +57,29 @@
    PROLOGUE

    bge $r0, N, .L999
-    li.d TEMP, 1
    movgr2fr.d a1, $r0
    ffint.d.l a1, a1
-    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG  t1, ALPHA // raw bits of alpha, beta and the zero constant moved to GPRs for broadcasting
+    MTG  t2, BETA
+    MTG  t3, a1 // a1 is all-zero bits (0.0 as double, and 0.0f in its low word), so it is valid for both precisions
+#ifdef DOUBLE
    vreplgr2vr.d VXA, t1
-    movfr2gr.d t2, BETA
    vreplgr2vr.d VXB, t2
-    movfr2gr.d t3, a1
    vreplgr2vr.d VXZ, t3
+#else
+    vreplgr2vr.w VXA, t1
+    vreplgr2vr.w VXB, t2
+    vreplgr2vr.w VXZ, t3
+#endif
+    // If incx == 0 || incy == 0, do one by one (the AND below is bitwise, so strides sharing no set bit also take this scalar path)
+    and TEMP, INCX, INCY
+    or  I,    N,    N // I = N: the scalar loop at .L998 handles every element
+    beqz TEMP, .L998
+
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, BASE_SHIFT // TEMP = scaled unit stride, compared against INCX / INCY below
    srai.d I, N, 3
    bne INCX, TEMP, .L20
    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
@ -52,15 +90,15 @@

 .L11:
    bge $r0, I, .L997
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L110
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L112 // ALPHA!=0 BETA==0
    b .L111 // ALPHA!=0 BETA!=0
    .align 3

 .L110:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L114 // ALPHA==0 BETA==0
    b .L113 // ALPHA==0 BETA!=0
    .align 3
@ -68,6 +106,7 @@
 .L111: // ALPHA!=0 BETA!=0
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vld VX3, Y, 2 * SIZE
    vfmul.d VX0, VX0, VXA
@ -86,6 +125,16 @@
    vfmadd.d VX3, VX3, VXB, VX1
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX2, VX2, VXB, VX0
+    vfmadd.s VX3, VX3, VXB, VX1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
@ -95,6 +144,7 @@

 .L112: // ALPHA!=0 BETA==0
    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vfmul.d VX0, VX0, VXA
    vfmul.d VX1, VX1, VXA
@ -106,6 +156,13 @@
    vfmul.d VX3, VX3, VXA
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
@ -113,7 +170,8 @@
    b .L997
    .align 3

-.L113: // ALPHA==0 BETA!=0\
+.L113: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
    vld VX0, Y, 0 * SIZE
    vld VX1, Y, 2 * SIZE
    vfmul.d VX0, VX0, VXB
@ -126,6 +184,14 @@
    vfmul.d VX3, VX3, VXB
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L113
@ -134,9 +200,13 @@

 .L114: // ALPHA==0 BETA==0
    vst VXZ, Y, 0 * SIZE
+#ifdef DOUBLE
    vst VXZ, Y, 2 * SIZE
    vst VXZ, Y, 4 * SIZE
    vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    addi.d  I, I, -1
    blt $r0, I, .L114
@ -146,21 +216,22 @@
 .L12: // INCX==1 and INCY!=1
    bge $r0, I, .L997
    move YY, Y
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1
    bcnez $fcc0, .L120
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L122 // ALPHA!=0 BETA==0
    b .L121 // ALPHA!=0 BETA!=0
    .align 3

 .L120:
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L124 // ALPHA==0 BETA==0
    b .L123 // ALPHA==0 BETA!=0
    .align 3

 .L121: // ALPHA!=0 BETA!=0
    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -212,6 +283,53 @@
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    vld VX1, X, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
@ -220,6 +338,7 @@

 .L122: // ALPHA!=0 BETA==0
    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
    vld VX1, X, 2 * SIZE
    vfmul.d VX0, VX0, VXA
    vfmul.d VX1, VX1, VXA
@ -242,6 +361,26 @@
    vstelm.d VX1, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX0, VX0, VXA
+    vfmul.s VX1, VX1, VXA
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    addi.d  I, I, -1
@ -250,6 +389,7 @@
    .align 3

 .L123: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -294,12 +434,57 @@
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L123
    b .L997
    .align 3

 .L124: // ALPHA==0 BETA==0
+#ifdef DOUBLE
    vstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 1
@ -315,6 +500,23 @@
    vstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    addi.d  I, I, -1
    blt $r0, I, .L124
@ -323,21 +525,22 @@

 .L21:// INCX!=1 and INCY==1
    bge $r0, I, .L997 // I <= 0: skip the unrolled loops
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1 // replaces the double-only fcmp.ceq.d; a1 is the zero constant per the branch comments below
    bcnez $fcc0, .L210 // ALPHA==0: decide on BETA in .L210
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L212 // ALPHA!=0 BETA==0
    b .L211 // ALPHA!=0 BETA!=0
    .align 3

 .L210: // reached from .L21 when ALPHA==0: pick the loop according to BETA
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1 // replaces the double-only fcmp.ceq.d; presumably expands per precision
    bcnez $fcc0, .L214 // ALPHA==0 BETA==0
    b .L213 // ALPHA==0 BETA!=0
    .align 3

 .L211: // ALPHA!=0 BETA!=0
    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -378,12 +581,47 @@
    vfmadd.d VX3, VX3, VXB, VX1
    addi.d  I, I, -1
    vst VX3, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX2, VXB, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX2, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    vfmadd.s VX3, VX3, VXB, VX1
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L211
    b .L997
    .align 3

 .L212: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -417,6 +655,37 @@
    vfmul.d VX1, VX1, VXA
    addi.d  I, I, -1
    vst VX1, Y, 6 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VXA, VX0
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vst VX1, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L212
    b .L997
@ -424,6 +693,7 @@

 .L213: // ALPHA==0 BETA!=0
    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
    vld VX3, Y, 2 * SIZE
    vfmul.d VX2, VX2, VXB
    vfmul.d VX3, VX3, VXB
@ -433,19 +703,30 @@
    vld VX3, Y, 6 * SIZE
    vfmul.d VX2, VX2, VXB
    vfmul.d VX3, VX3, VXB
-    addi.d  I, I, -1
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX3, Y, 4 * SIZE
+    vfmul.s VX2, VX2, VXB
+    vfmul.s VX3, VX3, VXB
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
    blt $r0, I, .L213
    b .L997
    .align 3

 .L214: // ALPHA==0 BETA==0: overwrite 8 contiguous Y elements per iteration with VXZ
    vst VXZ, Y, 0 * SIZE // VXZ is presumably the all-zero vector (set up outside this hunk)
+#ifdef DOUBLE
    vst VXZ, Y, 2 * SIZE // double: stores at element offsets 0/2/4/6 cover 8 elements
    vst VXZ, Y, 4 * SIZE
    vst VXZ, Y, 6 * SIZE
+#else
+    vst VXZ, Y, 4 * SIZE // single: stores at element offsets 0/4 cover 8 elements
+#endif
    addi.d Y, Y, 8 * SIZE // advance Y past the 8 elements just written
    addi.d  I, I, -1 // one iteration consumed
    blt $r0, I, .L214 // repeat while I > 0
@ -455,20 +736,21 @@
 .L22: // presumably INCX!=1 and INCY!=1 (the loops it dispatches to step X by INCX and Y by INCY)
    bge $r0, I, .L997 // I <= 0: skip the unrolled loops
    move YY, Y // YY is the store pointer; Y keeps walking ahead for the strided loads
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    CMPEQ $fcc0, ALPHA, a1 // replaces the double-only fcmp.ceq.d; a1 is the zero constant per the branch comments below
    bcnez $fcc0, .L220 // ALPHA==0: decide on BETA in .L220
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1
    bcnez $fcc0, .L222 // ALPHA!=0 BETA==0
    b .L221 // ALPHA!=0 BETA!=0
    .align 3

 .L220: // reached from .L22 when ALPHA==0: pick the loop according to BETA
-    fcmp.ceq.d $fcc0, BETA, a1
+    CMPEQ $fcc0, BETA, a1 // replaces the double-only fcmp.ceq.d; presumably expands per precision
    bcnez $fcc0, .L224 // ALPHA==0 BETA==0
    b .L223 // ALPHA==0 BETA!=0
    .align 3

 .L221: // ALPHA!=0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -541,12 +823,83 @@
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vfmadd.s VX2, VX2, VXB, VX0
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vfmadd.s VX3, VX3, VXB, VX1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L221
    b .L997
    .align 3

 .L222: // ALPHA!=0 BETA==0
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -591,12 +944,57 @@
    vstelm.d VX1, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX1, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmul.s VX0, VX0, VXA
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX0, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX0, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX1, VX1, VXA
+    addi.d  I, I, -1
+    vstelm.w VX1, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX1, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    b .L997
    .align 3

 .L223: // ALPHA==0 BETA!=0
+#ifdef DOUBLE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
    ld.d t2, Y, 0 * SIZE
@ -641,12 +1039,57 @@
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmul.s VX2, VX2, VXB
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    vfmul.s VX3, VX3, VXB
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L223
    b .L997
    .align 3

 .L224: // ALPHA==0 BETA==0
+#ifdef DOUBLE
    vstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 1
@ -662,6 +1105,23 @@
    vstelm.d VXZ, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VXZ, YY, 0, 1
+#else
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VXZ, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    addi.d  I, I, -1
    blt $r0, I, .L224
@ -674,12 +1134,12 @@
    .align 3

 .L998: // scalar loop: y = ALPHA*x + BETA*y, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f13, Y, 0 * SIZE
+    LD   $f12, X, 0 * SIZE // LD/MUL/MADD/ST replace the double-only fld.d/fmul.d/fmadd.d/fst.d; presumably they expand per precision
+    LD   $f13, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmul.d $f12, $f12, ALPHA
-    fmadd.d $f13, $f13, BETA, $f12
-    fst.d $f13, Y, 0 * SIZE
+    MUL  $f12, $f12, ALPHA // f12 = ALPHA * x
+    MADD $f13, $f13, BETA, $f12 // f13 = y * BETA + f12
+    ST   $f13, Y, 0 * SIZE
    add.d  X, X, INCX // INCX/INCY are added directly to the pointers here
    add.d  Y, Y, INCY
    blt $r0, I, .L998 // repeat while I > 0
--- a/kernel/loongarch64/daxpy_lasx.S
+++ b/kernel/loongarch64/daxpy_lasx.S
@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/

+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define XX     $r5
 #define YY     $r6
@ -35,16 +62,20 @@
    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT   a1,  a1
    movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT   a2,  a2
+    CMPEQ   $fcc0, ALPHA, a1
    bcnez $fcc0, .L999
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
    xvreplgr2vr.d VXA, t1
+#else
+    xvreplgr2vr.w VXA, t1
+#endif

    srai.d I, N, 3
    bne INCX, TEMP, .L20
@ -56,11 +87,12 @@

 .L11: // dispatch for the contiguous X/Y vector loops (.L111 / .L112)
    bge $r0, I, .L113 // I <= 0: no full 8-element group, skip the vector loops
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2 // a2 was loaded with 1 and converted by FFINT in the setup code above
    bceqz $fcc0, .L112 // ALPHA != 1: multiply-add loop; otherwise fall through to the plain-add loop .L111
    .align 3

 .L111:
+#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
@ -70,6 +102,13 @@
    addi.d  I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfadd.s VX2, VX0, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L111
@ -77,6 +116,7 @@
    .align 3

 .L112:
+#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    xvld VX2, Y, 0 * SIZE
    xvld VX1, X, 4 * SIZE
@ -86,6 +126,13 @@
    addi.d  I, I, -1
    xvst VX2, Y, 0 * SIZE
    xvst VX3, Y, 4 * SIZE
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    addi.d  I, I, -1
+    xvfmadd.s VX2, VX0, VXA, VX2
+    xvst VX2, Y, 0 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L112
@ -97,11 +144,11 @@
    .align 3

 .L114: // scalar loop, contiguous X and Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE // both pointers step by one element
    addi.d  Y, Y, SIZE
    blt $r0, I, .L114 // repeat while I > 0
@ -114,6 +161,7 @@
    .align 3

 .L121:
+#ifdef DOUBLE
    xvld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
@ -158,6 +206,50 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    blt $r0, I, .L121
@ -169,11 +261,11 @@
    .align 3

 .L123: // scalar loop, contiguous X and strided Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE // X steps by one element
    add.d  Y, Y, INCY // Y steps by INCY (shifted by BASE_SHIFT in the setup code)
    blt $r0, I, .L123 // repeat while I > 0
@ -185,6 +277,7 @@
    .align 3

 .L211:
+#ifdef DOUBLE
    xvld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
@ -217,6 +310,37 @@
    addi.d  I, I, -1
    xvst VX3, Y, 4 * SIZE
    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#endif
    blt $r0, I, .L211
    .align 3

@ -226,11 +350,11 @@
    .align 3

 .L213: // scalar loop, strided X and contiguous Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    add.d X, X, INCX // X steps by INCX (shifted by BASE_SHIFT in the setup code)
    addi.d Y, Y, SIZE // Y steps by one element
    blt $r0, I, .L213 // repeat while I > 0
@ -243,6 +367,7 @@
    .align 3

 .L222:
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -309,6 +434,73 @@
    xvstelm.d VX3, YY, 0, 2
    add.d YY, YY, INCY
    xvstelm.d VX3, YY, 0, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w VX2, t1, 0
+    xvinsgr2vr.w VX2, t2, 1
+    xvinsgr2vr.w VX2, t3, 2
+    xvinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    xvinsgr2vr.w VX2, t1, 4
+    xvinsgr2vr.w VX2, t2, 5
+    xvinsgr2vr.w VX2, t3, 6
+    xvinsgr2vr.w VX2, t4, 7
+    add.d Y, Y, INCY
+    xvfmadd.s VX2, VX0, VXA, VX2
+    addi.d  I, I, -1
+    xvstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 4
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 5
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 6
+    add.d YY, YY, INCY
+    xvstelm.w VX2, YY, 0, 7
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3
@ -319,15 +511,14 @@
    .align 3

 .L224: // scalar loop, strided X and Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    add.d X, X, INCX // both pointers step by their shifted increments
    add.d Y, Y, INCY
    blt $r0, I, .L224 // repeat while I > 0; on exit control falls through to .L999
-    b .L999
    .align 3

 .L999:
--- a/kernel/loongarch64/daxpy_lsx.S
+++ b/kernel/loongarch64/daxpy_lsx.S
@ -1,6 +1,33 @@
-#define ASSEMBLER
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/

+#define ASSEMBLER
 #include "common.h"
+
 #define N      $r4
 #define XX     $r5
 #define YY     $r6
@ -35,16 +62,20 @@
    bge $r0, N, .L999
    li.d TEMP, 1
    movgr2fr.d a1, $r0
-    ffint.d.l a1, a1
+    FFINT   a1,  a1
    movgr2fr.d a2, TEMP
-    ffint.d.l a2, a2
-    fcmp.ceq.d $fcc0, ALPHA, a1
+    FFINT   a2,  a2
+    CMPEQ   $fcc0, ALPHA, a1
    bcnez $fcc0, .L999
    slli.d  TEMP, TEMP, BASE_SHIFT
    slli.d  INCX, INCX, BASE_SHIFT
    slli.d  INCY, INCY, BASE_SHIFT
-    movfr2gr.d t1, ALPHA
+    MTG t1, ALPHA
+#ifdef DOUBLE
    vreplgr2vr.d VXA, t1
+#else
+    vreplgr2vr.w VXA, t1
+#endif

    srai.d I, N, 3
    bne INCX, TEMP, .L20
@ -56,11 +87,12 @@

 .L11: // dispatch for the contiguous X/Y vector loops (.L111 / .L112)
    bge $r0, I, .L113 // I <= 0: no full 8-element group, skip the vector loops
-    fcmp.ceq.d $fcc0, ALPHA, a2
+    CMPEQ $fcc0, ALPHA, a2 // a2 was loaded with 1 and converted by FFINT in the setup code above
    bceqz $fcc0, .L112 // ALPHA != 1: multiply-add loop; otherwise fall through to the plain-add loop .L111
    .align 3

 .L111:
+#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 2 * SIZE
@ -75,16 +107,27 @@
    vld VX3, Y, 6 * SIZE
    vfadd.d VX2, VX0, VX2
    vfadd.d VX3, VX1, VX3
-    addi.d  I, I, -1
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfadd.s VX2, VX0, VX2
+    vfadd.s VX3, VX1, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d X, X, 8 * SIZE
    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
    blt $r0, I, .L111
    b .L113
    .align 3

 .L112:
+#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    vld VX2, Y, 0 * SIZE
    vld VX1, X, 2 * SIZE
@ -104,6 +147,19 @@
    vst VX2, Y, 4 * SIZE
    vst VX3, Y, 6 * SIZE
    addi.d Y, Y, 8 * SIZE
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vfmadd.s VX2, VX0, VXA, VX2
+    vfmadd.s VX3, VX1, VXA, VX3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+#endif
    blt $r0, I, .L112
    .align 3

@ -113,11 +169,11 @@
    .align 3

 .L114: // scalar loop, contiguous X and Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE // both pointers step by one element
    addi.d  Y, Y, SIZE
    blt $r0, I, .L114 // repeat while I > 0
@ -130,6 +186,7 @@
    .align 3

 .L121:
+#ifdef DOUBLE
    vld VX0, X, 0 * SIZE
    ld.d t1, Y, 0 * SIZE
    add.d Y, Y, INCY
@ -180,6 +237,54 @@
    add.d YY, YY, INCY
    addi.d X, X, 8 * SIZE
    addi.d  I, I, -1
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX1, X, 4 * SIZE
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+#endif
    blt $r0, I, .L121
    .align 3

@ -189,11 +294,11 @@
    .align 3

 .L123: // scalar loop, contiguous X and strided Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    addi.d  X, X, SIZE // X steps by one element
    add.d  Y, Y, INCY // Y steps by INCY (shifted by BASE_SHIFT in the setup code)
    blt $r0, I, .L123 // repeat while I > 0
@ -205,6 +310,7 @@
    .align 3

 .L211:
+#ifdef DOUBLE
    vld VX2, Y, 0 * SIZE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
@ -242,6 +348,39 @@
    vfmadd.d VX3, VX1, VXA, VX3
    addi.d  I, I, -1
    vst VX3, Y, 6 * SIZE
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX2, VX0, VXA, VX2
+    vld VX3, Y, 4 * SIZE
+    vst VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    add.d X, X, INCX
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vst VX3, Y, 4 * SIZE
+#endif
    addi.d Y, Y, 8 * SIZE
    blt $r0, I, .L211
    .align 3
@ -252,11 +391,11 @@
    .align 3

 .L213: // scalar loop, strided X and contiguous Y: y += x * $f0, one element per iteration
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE // LD/MADD/ST replace the double-only fld.d/fmadd.d/fst.d; presumably they expand per precision
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1 // one element consumed
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14 // f14 = x * $f0 + y; $f0 is presumably ALPHA (aliases are defined outside this hunk)
+    ST  $f14, Y, 0 * SIZE
    add.d X, X, INCX // X steps by INCX (shifted by BASE_SHIFT in the setup code)
    addi.d Y, Y, SIZE // Y steps by one element
    blt $r0, I, .L213 // repeat while I > 0
@ -269,6 +408,7 @@
    .align 3

 .L222:
+#ifdef DOUBLE
    ld.d t1, X, 0 * SIZE
    add.d X, X, INCX
    ld.d t2, X, 0 * SIZE
@ -337,6 +477,74 @@
    vstelm.d VX3, YY, 0, 0
    add.d YY, YY, INCY
    vstelm.d VX3, YY, 0, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    add.d X, X, INCX
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX2, VX0, VXA, VX2
+    ld.w t1, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t2, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    add.d X, X, INCX
+    ld.w t4, X, 0 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vstelm.w VX2, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX2, YY, 0, 3
+    add.d YY, YY, INCY
+    ld.w t1, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t2, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    add.d Y, Y, INCY
+    ld.w t4, Y, 0 * SIZE
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    add.d Y, Y, INCY
+    vfmadd.s VX3, VX1, VXA, VX3
+    addi.d  I, I, -1
+    vstelm.w VX3, YY, 0, 0
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 1
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 2
+    add.d YY, YY, INCY
+    vstelm.w VX3, YY, 0, 3
+#endif
    add.d YY, YY, INCY
    blt $r0, I, .L222
    .align 3
@ -347,11 +555,11 @@
    .align 3

 .L224:
-    fld.d $f12, X, 0 * SIZE
-    fld.d $f14, Y, 0 * SIZE
+    LD  $f12, X, 0 * SIZE
+    LD  $f14, Y, 0 * SIZE
    addi.d I, I, -1
-    fmadd.d $f14, $f12, $f0, $f14
-    fst.d $f14, Y, 0 * SIZE
+    MADD $f14, $f12, $f0, $f14
+    ST  $f14, Y, 0 * SIZE
    add.d X, X, INCX
    add.d Y, Y, INCY
    blt $r0, I, .L224
--- a/kernel/loongarch64/camax_lasx.S
+++ b/kernel/loongarch64/camax_lasx.S
@ -0,0 +1,212 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    PROLOGUE
+    xvxor.v VM0, VM0, VM0 // running max = 0 in every lane; s1 ($f22) has the same register number as VM0 ($xr22)
+    xvxor.v res0, res0, res0 // all-zero vector, used below as the minuend of "0 - x"
+    bge $r0, N, .L999 // N <= 0: return the zeroed $f22
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (macro from outside this file; presumably bytes per complex element)
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX scaled by the same shift so it can be added to X directly
+    srai.d I, N, 3 // I = N / 8 = trip count of the 8-element unrolled loops
+    bne INCX, TEMP, .L20 // scaled INCX differs from TEMP (INCX was not 1): strided path
+    bge $r0, I, .L23 // fewer than 8 elements: go straight to the remainder loop
+    .align 3
+
+.L10: // unit-stride main loop: 16 scalars (8 complex elements) per iteration
+    xvld VX0, X, 0 // bytes 0..31
+    xvld VX1, X, 32 // bytes 32..63
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0 // even-indexed elements (real parts, assuming interleaved re/im storage)
+    xvpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts, same assumption)
+#else
+    xvpickev.w x1, VX1, VX0 // even-indexed elements
+    xvpickod.w x2, VX1, VX0 // odd-indexed elements
+#endif
+    XVFSUB x3, res0, x1 // x3 = 0 - x1 (XVF* are macros from outside this file; presumably precision-selecting LASX ops)
+    XVFSUB x4, res0, x2 // x4 = 0 - x2
+    XVFMAX x1, x1, x3 // max(x, -x) = |x1|
+    XVFMAX x2, x2, x4 // |x2|
+    XVFADD VM1, x1, x2 // per-lane sum of the two absolute values
+    XVFMAX VM0, VM0, VM1 // lane-wise running maximum
+#ifdef DOUBLE
+    xvld VX0, X, 64 // double precision needs two more 32-byte loads to cover 16 scalars
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMAX VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    // Horizontal reduction of the lane-wise maxima in VM0 down to lane 0,
+    // which is the lane the scalar code at .L23/.L24/.L999 reads as s1/$f22.
+    // VM0 is 256 bits wide, i.e. 4 doubles or 8 floats, and every lane
+    // holds a candidate.  The previous code folded only lanes 0-1 (double)
+    // or 0-3 (single), silently dropping maxima that sat in the upper
+    // lanes.  xvpickve.{d,w} moves the selected lane into lane 0 of the
+    // destination, so each pick below is compared in lane 0.
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    XVFMAX x1, x1, x2 // max of lanes 0,1
+    XVFMAX x3, x3, x4 // max of lanes 2,3
+    XVFMAX VM0, x1, x3 // lane 0 = max of all four lanes
+#else
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    XVFMAX x1, x1, x2
+    XVFMAX x3, x3, x4
+    XVFMAX VM1, x1, x3 // max of lanes 0..3, parked in VM1
+    xvpickve.w x1, VM0, 4 // VM0 itself is still intact here
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    XVFMAX x1, x1, x2
+    XVFMAX x3, x3, x4
+    XVFMAX x1, x1, x3 // max of lanes 4..7
+    XVFMAX VM0, VM1, x1 // lane 0 = max of all eight lanes
+#endif
+    b .L23 // continue with the N & 7 remainder
+    .align 3
+
+.L20: // INCX!=1
+    // Strided path: 8 complex elements per iteration, processed as four
+    // pairs.  For each pair the two |re|+|im| values are reduced to one
+    // and then folded into the running maximum s1.  Previously every
+    // "FMAX sN, t1, t3" overwrote its destination, so the first pair and
+    // all earlier iterations were lost, and .L22 additionally read s2,
+    // which is never written on this path.
+    bge $r0, I, .L23 // fewer than 8 elements: remainder loop only
+    .align 3
+
+.L21:
+    LD t1, X, 0 * SIZE // re of element 0
+    LD t2, X, 1 * SIZE // im of element 0
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE // re of element 1
+    LD t4, X, 1 * SIZE // im of element 1
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2 // |re0| + |im0|
+    ADD t3, t3, t4 // |re1| + |im1|
+    FMAX t1, t1, t3 // larger of the pair
+    FMAX s1, s1, t1 // fold into the running maximum
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // s1 already holds the maximum of everything seen; nothing left to combine
+    .align 3
+
+.L23: // remainder: the N & 7 trailing elements (the whole input when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L24: // scalar loop, one complex element per iteration
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // |re| + |im|
+    add.d  X, X, INCX // INCX was scaled by ZBASE_SHIFT in the prologue
+    FMAX s1, a0, s1 // fold into the running maximum
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV $f0, $f22 // result is handed back in $f0; $f22 is s1
+    jirl $r0, $r1, 0x0 // jump to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/camax_lsx.S
+++ b/kernel/loongarch64/camax_lsx.S
@ -0,0 +1,239 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    PROLOGUE
+    vxor.v VM0, VM0, VM0 // running max = 0 in every lane; s1 ($f22) has the same register number as VM0 ($vr22)
+    vxor.v res0, res0, res0 // all-zero vector, used below as the minuend of "0 - x"
+    bge $r0, N, .L999 // N <= 0: return the zeroed $f22
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (macro from outside this file; presumably bytes per complex element)
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX scaled by the same shift so it can be added to X directly
+    srai.d I, N, 3 // I = N / 8 = trip count of the 8-element unrolled loops
+    bne INCX, TEMP, .L20 // scaled INCX differs from TEMP (INCX was not 1): strided path
+    bge $r0, I, .L23 // fewer than 8 elements: go straight to the remainder loop
+    .align 3
+
+.L10: // unit-stride main loop: 16 scalars (8 complex elements) per iteration
+    vld VX0, X, 0 // bytes 0..15
+    vld VX1, X, 16 // bytes 16..31
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0 // even-indexed elements (real parts, assuming interleaved re/im storage)
+    vpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts, same assumption)
+#else
+    vpickev.w x1, VX1, VX0 // even-indexed elements
+    vpickod.w x2, VX1, VX0 // odd-indexed elements
+#endif
+    VFSUB x3, res0, x1 // x3 = 0 - x1 (VF* are macros from outside this file; presumably precision-selecting LSX ops)
+    VFSUB x4, res0, x2 // x4 = 0 - x2
+    VFMAX x1, x1, x3 // max(x, -x) = |x1|
+    VFMAX x2, x2, x4 // |x2|
+    VFADD VM1, x1, x2 // first batch of per-lane sums, kept in VM1
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2 // second batch of per-lane sums
+    VFMAX VM1, x1, VM1 // larger of the two batches
+    VFMAX VM0, VM0, VM1 // lane-wise running maximum
+#ifdef DOUBLE
+    vld VX0, X, 64 // double precision needs bytes 64..127 as well to cover 16 scalars
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMAX VM1, x1, VM1
+    VFMAX VM0, VM0, VM1
+#endif
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    addi.d I, I, -1
+    blt $r0, I, .L10
+    .align 3
+
+.L11: // horizontal reduction: fold every lane of VM0 so that lane 0 (read later as s1/$f22) holds the overall max
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0 // broadcast lane 0
+    vreplvei.d x2, VM0, 1 // broadcast lane 1
+    VFMAX VM0, x1, x2 // both 64-bit lanes covered
+#else
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VFMAX VM1, x1, x2 // max of lanes 0,1
+    VFMAX VM0, x3, x4 // max of lanes 2,3
+    VFMAX VM0, VM0, VM1 // all four 32-bit lanes covered
+#endif
+    b .L23 // continue with the N & 7 remainder
+    .align 3
+
+.L20: // INCX!=1
+    // Strided path: 8 complex elements per iteration, processed as four
+    // pairs.  For each pair the two |re|+|im| values are reduced to one
+    // and then folded into the running maximum s1.  Previously every
+    // "FMAX sN, t1, t3" overwrote its destination, so the first pair and
+    // all earlier iterations were lost, and .L22 additionally read s2,
+    // which is never written on this path.
+    bge $r0, I, .L23 // fewer than 8 elements: remainder loop only
+    .align 3
+
+.L21:
+    LD t1, X, 0 * SIZE // re of element 0
+    LD t2, X, 1 * SIZE // im of element 0
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE // re of element 1
+    LD t4, X, 1 * SIZE // im of element 1
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2 // |re0| + |im0|
+    ADD t3, t3, t4 // |re1| + |im1|
+    FMAX t1, t1, t3 // larger of the pair
+    FMAX s1, s1, t1 // fold into the running maximum
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMAX t1, t1, t3
+    FMAX s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // s1 already holds the maximum of everything seen; nothing left to combine
+    .align 3
+
+.L23: // remainder: the N & 7 trailing elements (the whole input when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L24: // scalar loop, one complex element per iteration
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // |re| + |im|
+    add.d  X, X, INCX // INCX was scaled by ZBASE_SHIFT in the prologue
+    FMAX s1, a0, s1 // fold into the running maximum
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV $f0, $f22 // result is handed back in $f0; $f22 is s1
+    jirl $r0, $r1, 0x0 // jump to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/camin_lasx.S
+++ b/kernel/loongarch64/camin_lasx.S
@ -0,0 +1,221 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define TEMP $r16
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define a0   $f20
+#define a1   $f21
+#define x1   $xr9
+#define x2   $xr10
+#define x3   $xr11
+#define x4   $xr12
+#define VT0  $xr13
+#define VT1  $xr14
+#define res0 $xr18
+#define neg1 $xr19
+#define VX0  $xr20
+#define VX1  $xr21
+#define VM0  $xr22
+#define VM1  $xr23
+
+    PROLOGUE
+    MTC s1, $r0 // s1 = 0 from the zero register (MTC is a macro from outside this file): value returned by the early exits
+    xvxor.v res0, res0, res0 // all-zero vector, used below as the minuend of "0 - x"
+    bge $r0, N, .L999 // N <= 0: return 0
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0 // seed the running minimum with |re| + |im| of the first element
+#ifdef DOUBLE
+    xvreplve0.d VM0, VM0 // broadcast lane 0 of VM0 to all lanes; s1 ($f22) and VM0 ($xr22) share a register number
+#else
+    xvreplve0.w VM0, VM0 // broadcast lane 0 of VM0 to all lanes
+#endif
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (macro from outside this file; presumably bytes per complex element)
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX scaled by the same shift so it can be added to X directly
+    srai.d I, N, 3 // I = N / 8 = trip count of the 8-element unrolled loops
+    bne INCX, TEMP, .L20 // scaled INCX differs from TEMP (INCX was not 1): strided path
+    bge $r0, I, .L23 // fewer than 8 elements: go straight to the remainder loop
+    .align 3
+
+.L10: // unit-stride main loop: 16 scalars (8 complex elements) per iteration
+    xvld VX0, X, 0 // bytes 0..31
+    xvld VX1, X, 32 // bytes 32..63
+#ifdef DOUBLE
+    xvpickev.d x1, VX1, VX0 // even-indexed elements (real parts, assuming interleaved re/im storage)
+    xvpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts, same assumption)
+#else
+    xvpickev.w x1, VX1, VX0 // even-indexed elements
+    xvpickod.w x2, VX1, VX0 // odd-indexed elements
+#endif
+    XVFSUB x3, res0, x1 // x3 = 0 - x1 (XVF* are macros from outside this file; presumably precision-selecting LASX ops)
+    XVFSUB x4, res0, x2 // x4 = 0 - x2
+    XVFMAX x1, x1, x3 // max(x, -x) = |x1|
+    XVFMAX x2, x2, x4 // |x2|
+    XVFADD VM1, x1, x2 // per-lane sum of the two absolute values
+    XVFMIN VM0, VM0, VM1 // lane-wise running minimum
+#ifdef DOUBLE
+    xvld VX0, X, 64 // double precision needs two more 32-byte loads to cover 16 scalars
+    xvld VX1, X, 96
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    XVFSUB x3, res0, x1
+    XVFSUB x4, res0, x2
+    XVFMAX x1, x1, x3
+    XVFMAX x2, x2, x4
+    XVFADD VM1, x1, x2
+    XVFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    blt $r0, I, .L10
+    .align 3
+
+.L11:
+    // Horizontal reduction of the lane-wise minima in VM0 down to lane 0,
+    // which is the lane the scalar code at .L23/.L24/.L999 reads as s1/$f22.
+    // VM0 is 256 bits wide, i.e. 4 doubles or 8 floats, and every lane
+    // holds a candidate.  The previous code folded only lanes 0-1 (double)
+    // or 0-3 (single), silently dropping minima that sat in the upper
+    // lanes.  xvpickve.{d,w} moves the selected lane into lane 0 of the
+    // destination, so each pick below is compared in lane 0.
+#ifdef DOUBLE
+    xvpickve.d x1, VM0, 0
+    xvpickve.d x2, VM0, 1
+    xvpickve.d x3, VM0, 2
+    xvpickve.d x4, VM0, 3
+    XVFMIN x1, x1, x2 // min of lanes 0,1
+    XVFMIN x3, x3, x4 // min of lanes 2,3
+    XVFMIN VM0, x1, x3 // lane 0 = min of all four lanes
+#else
+    xvpickve.w x1, VM0, 0
+    xvpickve.w x2, VM0, 1
+    xvpickve.w x3, VM0, 2
+    xvpickve.w x4, VM0, 3
+    XVFMIN x1, x1, x2
+    XVFMIN x3, x3, x4
+    XVFMIN VM1, x1, x3 // min of lanes 0..3, parked in VM1
+    xvpickve.w x1, VM0, 4 // VM0 itself is still intact here
+    xvpickve.w x2, VM0, 5
+    xvpickve.w x3, VM0, 6
+    xvpickve.w x4, VM0, 7
+    XVFMIN x1, x1, x2
+    XVFMIN x3, x3, x4
+    XVFMIN x1, x1, x3 // min of lanes 4..7
+    XVFMIN VM0, VM1, x1 // lane 0 = min of all eight lanes
+#endif
+    b .L23 // continue with the N & 7 remainder
+    .align 3
+
+.L20: // INCX!=1
+    // Strided path: 8 complex elements per iteration, processed as four
+    // pairs.  For each pair the two |re|+|im| values are reduced to one
+    // and then folded into the running minimum s1 (seeded in the prologue
+    // from the first element).  Previously every "FMIN sN, t1, t3"
+    // overwrote its destination, so the first pair and all earlier
+    // iterations were lost, and .L22 additionally read s2, which is never
+    // written on this path.
+    bge $r0, I, .L23 // fewer than 8 elements: remainder loop only
+    .align 3
+
+.L21:
+    LD t1, X, 0 * SIZE // re of element 0
+    LD t2, X, 1 * SIZE // im of element 0
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE // re of element 1
+    LD t4, X, 1 * SIZE // im of element 1
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2 // |re0| + |im0|
+    ADD t3, t3, t4 // |re1| + |im1|
+    FMIN t1, t1, t3 // smaller of the pair
+    FMIN s1, s1, t1 // fold into the running minimum
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // s1 already holds the minimum of everything seen; nothing left to combine
+    .align 3
+
+.L23: // remainder: the N & 7 trailing elements (the whole input when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L24: // scalar loop, one complex element per iteration
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // |re| + |im|
+    add.d  X, X, INCX // INCX was scaled by ZBASE_SHIFT in the prologue
+    FMIN s1, a0, s1 // fold into the running minimum
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV $f0, $f22 // result is handed back in $f0; $f22 is s1
+    jirl $r0, $r1, 0x0 // jump to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/camin_lsx.S
+++ b/kernel/loongarch64/camin_lsx.S
@ -0,0 +1,248 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N    $r4
+#define X    $r5
+#define INCX $r6
+#define I    $r12
+#define t1   $f14
+#define t2   $f18
+#define t3   $f15
+#define t4   $f17
+#define s1   $f22
+#define s2   $f9
+#define s3   $f10
+#define s4   $f11
+#define TEMP $r16
+#define a0   $f20
+#define a1   $f21
+#define x1   $vr9
+#define x2   $vr10
+#define x3   $vr11
+#define x4   $vr12
+#define VT0  $vr13
+#define VT1  $vr14
+#define res0 $vr18
+#define neg1 $vr19
+#define VX0  $vr20
+#define VX1  $vr21
+#define VM0  $vr22
+#define VM1  $vr23
+
+    PROLOGUE
+    MTC s1, $r0 // s1 = 0 from the zero register (MTC is a macro from outside this file): value returned by the early exits
+    vxor.v res0, res0, res0 // all-zero vector, used below as the minuend of "0 - x"
+    bge $r0, N, .L999 // N <= 0: return 0
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    FABS a0, a0
+    FABS a1, a1
+    ADD s1, a1, a0 // seed the running minimum with |re| + |im| of the first element
+#ifdef DOUBLE
+    vreplvei.d VM0, VM0, 0 // broadcast lane 0 of VM0 to all lanes; s1 ($f22) and VM0 ($vr22) share a register number
+#else
+    vreplvei.w VM0, VM0, 0 // broadcast lane 0 of VM0 to all lanes
+#endif
+    li.d TEMP, 1
+    slli.d TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (macro from outside this file; presumably bytes per complex element)
+    slli.d INCX, INCX, ZBASE_SHIFT // INCX scaled by the same shift so it can be added to X directly
+    srai.d I, N, 3 // I = N / 8 = trip count of the 8-element unrolled loops
+    bne INCX, TEMP, .L20 // scaled INCX differs from TEMP (INCX was not 1): strided path
+    bge $r0, I, .L23 // fewer than 8 elements: go straight to the remainder loop
+    .align 3
+
+.L10: // unit-stride main loop: 16 scalars (8 complex elements) per iteration
+    vld VX0, X, 0 // bytes 0..15
+    vld VX1, X, 16 // bytes 16..31
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0 // even-indexed elements (real parts, assuming interleaved re/im storage)
+    vpickod.d x2, VX1, VX0 // odd-indexed elements (imaginary parts, same assumption)
+#else
+    vpickev.w x1, VX1, VX0 // even-indexed elements
+    vpickod.w x2, VX1, VX0 // odd-indexed elements
+#endif
+    VFSUB x3, res0, x1 // x3 = 0 - x1 (VF* are macros from outside this file; presumably precision-selecting LSX ops)
+    VFSUB x4, res0, x2 // x4 = 0 - x2
+    VFMAX x1, x1, x3 // max(x, -x) = |x1|
+    VFMAX x2, x2, x4 // |x2|
+    VFADD VM1, x1, x2 // first batch of per-lane sums, kept in VM1
+
+    vld VX0, X, 32
+    vld VX1, X, 48
+#ifdef DOUBLE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2 // second batch of per-lane sums
+    VFMIN VM1, x1, VM1 // smaller of the two batches
+    VFMIN VM0, VM0, VM1 // lane-wise running minimum
+#ifdef DOUBLE
+    vld VX0, X, 64 // double precision needs bytes 64..127 as well to cover 16 scalars
+    vld VX1, X, 80
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD VM1, x1, x2
+
+    vld VX0, X, 96
+    vld VX1, X, 112
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    VFSUB x3, res0, x1
+    VFSUB x4, res0, x2
+    VFMAX x1, x1, x3
+    VFMAX x2, x2, x4
+    VFADD x1, x1, x2
+    VFMIN VM1, x1, VM1
+    VFMIN VM0, VM0, VM1
+#endif
+    addi.d I, I, -1
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    blt $r0, I, .L10
+    .align 3
+
+.L11: // horizontal reduction: fold every lane of VM0 so that lane 0 (read later as s1/$f22) holds the overall min
+#ifdef DOUBLE
+    vreplvei.d x1, VM0, 0 // broadcast lane 0
+    vreplvei.d x2, VM0, 1 // broadcast lane 1
+    VFMIN VM0, x1, x2 // both 64-bit lanes covered
+#else
+    vreplvei.w x1, VM0, 0
+    vreplvei.w x2, VM0, 1
+    vreplvei.w x3, VM0, 2
+    vreplvei.w x4, VM0, 3
+    VFMIN VM1, x1, x2 // min of lanes 0,1
+    VFMIN VM0, x3, x4 // min of lanes 2,3
+    VFMIN VM0, VM0, VM1 // all four 32-bit lanes covered
+#endif
+    b .L23 // continue with the N & 7 remainder
+    .align 3
+
+.L20: // INCX!=1
+    // Strided path: 8 complex elements per iteration, processed as four
+    // pairs.  For each pair the two |re|+|im| values are reduced to one
+    // and then folded into the running minimum s1 (seeded in the prologue
+    // from the first element).  Previously every "FMIN sN, t1, t3"
+    // overwrote its destination, so the first pair and all earlier
+    // iterations were lost, and .L22 additionally read s2, which is never
+    // written on this path.
+    bge $r0, I, .L23 // fewer than 8 elements: remainder loop only
+    .align 3
+
+.L21:
+    LD t1, X, 0 * SIZE // re of element 0
+    LD t2, X, 1 * SIZE // im of element 0
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE // re of element 1
+    LD t4, X, 1 * SIZE // im of element 1
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2 // |re0| + |im0|
+    ADD t3, t3, t4 // |re1| + |im1|
+    FMIN t1, t1, t3 // smaller of the pair
+    FMIN s1, s1, t1 // fold into the running minimum
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    addi.d I, I, -1
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    LD t1, X, 0 * SIZE
+    LD t2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD t3, X, 0 * SIZE
+    LD t4, X, 1 * SIZE
+    add.d X, X, INCX
+    FABS t1, t1
+    FABS t2, t2
+    FABS t3, t3
+    FABS t4, t4
+    ADD t1, t1, t2
+    ADD t3, t3, t4
+    FMIN t1, t1, t3
+    FMIN s1, s1, t1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // s1 already holds the minimum of everything seen; nothing left to combine
+    .align 3
+
+.L23: // remainder: the N & 7 trailing elements (the whole input when N < 8)
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L24: // scalar loop, one complex element per iteration
+    LD a0, X, 0 * SIZE
+    LD a1, X, 1 * SIZE
+    addi.d I, I, -1
+    FABS a0, a0
+    FABS a1, a1
+    ADD a0, a0, a1 // |re| + |im|
+    add.d  X, X, INCX // INCX was scaled by ZBASE_SHIFT in the prologue
+    FMIN s1, a0, s1 // fold into the running minimum
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV $f0, $f22 // result is handed back in $f0; $f22 is s1
+    jirl $r0, $r1, 0x0 // jump to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/casum_lasx.S
+++ b/kernel/loongarch64/casum_lasx.S
@ -0,0 +1,329 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4
+#define X      $r5
+#define INCX   $r6
+#define I      $r17
+#define TEMP   $r18
+#define t1     $r15
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define res1   $xr16
+#define res2   $xr17
+#define res3   $xr18
+#define res0   $xr19
+#define neg1   $xr20
+#define VT0    $xr21
+#define VT1    $xr22
+
+    PROLOGUE
+    xvxor.v res1, res1, res1 // vector accumulator = 0; s1 ($f16) has the same register number as res1 ($xr16)
+    xvxor.v res2, res2, res2 // per-iteration partial sum, zeroed
+    xvxor.v res0, res0, res0 // all-zero vector, compared against to find negative lanes
+    bge $r0, N, .L999 // N <= 0: return the zeroed $f16
+    bge $r0, INCX, .L999 // INCX <= 0: likewise
+#ifdef DOUBLE
+    li.d t1, -1
+    xvreplgr2vr.d neg1, t1 // broadcast integer -1 to every 64-bit lane
+    xvffint.d.l neg1, neg1 // convert to floating point: neg1 = -1.0 per lane
+#else
+    li.w t1, -1
+    xvreplgr2vr.w neg1, t1 // broadcast integer -1 to every 32-bit lane
+    xvffint.s.w neg1, neg1 // convert to floating point: neg1 = -1.0f per lane
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = 1 << ZBASE_SHIFT (macro from outside this file; presumably bytes per complex element)
+    slli.d  INCX, INCX, ZBASE_SHIFT // INCX scaled by the same shift so it can be added to X directly
+    srai.d I, N, 3 // I = N / 8 = trip count of the 8-element unrolled loops
+    bne INCX, TEMP, .L20 // scaled INCX differs from TEMP (INCX was not 1): strided path
+    bge $r0, I, .L13 // fewer than 8 elements: go straight to the remainder loop
+    .align 3
+
+.L11: // unit-stride main loop: 16 scalars (8 complex elements) per iteration
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvfmul.d VX2, neg1, VX0 // VX2 = -VX0
+    xvfmul.d VX3, neg1, VX1 // VX3 = -VX1
+    xvfcmp.clt.d VT0, VX0, res0 // mask of lanes where VX0 < 0
+    xvfcmp.clt.d VT1, VX1, res0 // mask of lanes where VX1 < 0
+    xvbitsel.v VX0, VX0, VX2, VT0 // take the negated value in masked lanes: |VX0|
+    xvbitsel.v VX1, VX1, VX3, VT1 // |VX1|
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2 // accumulate
+    xvld VX2, X, 8 * SIZE // second half of the 16 scalars, same steps with the register roles swapped
+    xvld VX3, X, 12 * SIZE
+    xvfmul.d VX0, neg1, VX2
+    xvfmul.d VX1, neg1, VX3
+    xvfcmp.clt.d VT0, VX2, res0
+    xvfcmp.clt.d VT1, VX3, res0
+    xvbitsel.v VX2, VX2, VX0, VT0
+    xvbitsel.v VX3, VX3, VX1, VT1
+    xvfadd.d res2, VX2, VX3
+    xvfadd.d res1, res1, res2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvfmul.s VX2, neg1, VX0 // VX2 = -VX0
+    xvfmul.s VX3, neg1, VX1 // VX3 = -VX1
+    xvfcmp.clt.s VT0, VX0, res0 // mask of lanes where VX0 < 0
+    xvfcmp.clt.s VT1, VX1, res0 // mask of lanes where VX1 < 0
+    xvbitsel.v VX0, VX0, VX2, VT0 // |VX0|
+    xvbitsel.v VX1, VX1, VX3, VT1 // |VX1|
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1 // accumulate
+#endif
+    addi.d X, X, 16 * SIZE // advance past the 16 scalars just consumed
+    addi.d  I, I, -1
+    blt $r0, I, .L11
+    .align 3
+
+.L12:
+    // Horizontal sum of the vector accumulator res1 into its lane 0, which
+    // the scalar code below reads as s1/$f16.  xvpickve.{d,w} moves the
+    // selected lane into lane 0 of the destination and clears the other
+    // lanes, so each xvfadd below changes only lane 0 of res1 and lanes
+    // 1..7 remain available for the later picks.
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    // The single-precision branch used to form res2 = res1 + res2 first;
+    // res2 still held the last iteration's partial sum, so lanes 4..7 of
+    // that iteration were counted twice.  It also added lane 6 twice and
+    // never added lane 7.  All eight lanes are now taken from res1 once.
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res1, 4
+    xvpickve.w VX1, res1, 5
+    xvpickve.w VX2, res1, 6
+    xvpickve.w VX3, res1, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13: // unit-stride remainder: the N & 7 trailing elements
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L14: // scalar loop, one complex element per iteration
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2 // |re| + |im|
+    ADD  s1, a1, s1 // add to the running sum
+    addi.d  X, X, 2 * SIZE // next contiguous complex element
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // strided path (scaled INCX differed from TEMP in the prologue)
+    bge $r0, I, .L23 // fewer than 8 elements: remainder loop only
+    .align 3
+
+.L21: // 8 complex elements per iteration, gathered through integer registers into vector lanes
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads of re/im into GPRs
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0 // VX0 = 2 complex elements (4 doubles)
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0 // VX1 = next 2 complex elements
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0 // VX2 = -VX0
+    xvfmul.d VX3, neg1, VX1 // VX3 = -VX1
+    xvfcmp.clt.d VT0, VX0, res0 // mask of lanes where VX0 < 0
+    xvfcmp.clt.d VT1, VX1, res0 // mask of lanes where VX1 < 0
+    xvbitsel.v VX0, VX0, VX2, VT0 // |VX0|
+    xvbitsel.v VX1, VX1, VX3, VT1 // |VX1|
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2 // accumulate
+    ld.d t1, X, 0 * SIZE // second group of 4 complex elements, same steps
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    xvfmul.d VX2, neg1, VX0
+    xvfmul.d VX3, neg1, VX1
+    xvfcmp.clt.d VT0, VX0, res0
+    xvfcmp.clt.d VT1, VX1, res0
+    xvbitsel.v VX0, VX0, VX2, VT0
+    xvbitsel.v VX1, VX1, VX3, VT1
+    xvfadd.d res2, VX0, VX1
+    xvfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE // raw 32-bit loads of re/im into GPRs
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0 // VX0 lanes 0..3 = complex elements 0,1
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4 // VX0 lanes 4..7 = complex elements 2,3
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0 // VX1 lanes 0..3 = complex elements 4,5
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4 // VX1 lanes 4..7 = complex elements 6,7
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvfmul.s VX2, neg1, VX0 // VX2 = -VX0
+    xvfmul.s VX3, neg1, VX1 // VX3 = -VX1
+    xvfcmp.clt.s VT0, VX0, res0 // mask of lanes where VX0 < 0
+    xvfcmp.clt.s VT1, VX1, res0 // mask of lanes where VX1 < 0
+    xvbitsel.v VX0, VX0, VX2, VT0 // |VX0|
+    xvbitsel.v VX1, VX1, VX3, VT1 // |VX1|
+    xvfadd.s res2, VX0, VX1
+    xvfadd.s res1, res2, res1 // accumulate
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22:
+    // Horizontal sum of the vector accumulator res1 into its lane 0, which
+    // the scalar code below reads as s1/$f16.  xvpickve.{d,w} moves the
+    // selected lane into lane 0 of the destination and clears the other
+    // lanes, so each xvfadd below changes only lane 0 of res1 and lanes
+    // 1..7 remain available for the later picks.
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+#else
+    // The single-precision branch used to form res2 = res1 + res2 first;
+    // res2 still held the last iteration's partial sum, so lanes 4..7 of
+    // that iteration were counted twice.  It also added lane 6 twice and
+    // never added lane 7.  All eight lanes are now taken from res1 once.
+    xvpickve.w VX1, res1, 1
+    xvpickve.w VX2, res1, 2
+    xvpickve.w VX3, res1, 3
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvpickve.w VX0, res1, 4
+    xvpickve.w VX1, res1, 5
+    xvpickve.w VX2, res1, 6
+    xvpickve.w VX3, res1, 7
+    xvfadd.s res1, VX0, res1
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23: // strided remainder: the N & 7 trailing elements
+    andi I, N, 7
+    bge $r0, I, .L999 // nothing left
+    .align 3
+
+.L24: // scalar loop, one complex element per iteration
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2 // |re| + |im|
+    ADD  s1, a1, s1 // add to the running sum
+    add.d  X, X, INCX // INCX was scaled by ZBASE_SHIFT in the prologue
+    blt $r0, I, .L24
+    .align 3
+
+.L999:
+    MOV  $f0, $f16 // result is handed back in $f0; $f16 is s1
+    jirl $r0, $r1, 0x0 // jump to the address in $r1
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/casum_lsx.S
+++ b/kernel/loongarch64/casum_lsx.S
@ -0,0 +1,358 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4 // element count: tested against zero, then split into N >> 3 and N & 7
+#define X      $r5 // current read address
+#define INCX   $r6 // stride, shifted left by ZBASE_SHIFT in the prologue
+#define I      $r17 // loop counter
+#define TEMP   $r18 // 1 << ZBASE_SHIFT, compared with INCX to detect unit stride
+#define t1     $r15 // t1-t4: integer scratch for the strided loads
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12 // a1, a2: operands of the scalar tail loops
+#define a2     $f13
+#define a3     $f14 // a3, a4: not referenced in this file
+#define a4     $f15
+#define s1     $f16 // scalar accumulator, same register number as res1
+#define VX0    $vr12 // VX0-VX3: vector data and scratch
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define res1   $vr16 // running vector sum
+#define res2   $vr17 // per-iteration partial sums
+#define res3   $vr18
+#define res0   $vr19 // all zeros, comparison operand
+#define neg1   $vr20 // -1.0 in every element
+#define VT0    $vr21 // VT0, VT1: masks produced by the less-than-zero compares
+#define VT1    $vr22
+
+    PROLOGUE
+    vxor.v res1, res1, res1 // accumulator = 0
+    vxor.v res2, res2, res2
+    vxor.v res0, res0, res0 // zero vector for the sign tests
+    bge $r0, N, .L999 // N <= 0: exit with the zeroed accumulator
+    bge $r0, INCX, .L999 // INCX <= 0: exit with the zeroed accumulator
+#ifdef DOUBLE
+    li.d t1, -1
+    vreplgr2vr.d neg1, t1
+    vffint.d.l neg1, neg1 // integer -1 converted to -1.0 in every element
+#else
+    li.w t1, -1
+    vreplgr2vr.w neg1, t1
+    vffint.s.w neg1, neg1 // integer -1 converted to -1.0 in every element
+#endif
+    li.d  TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = unit stride after scaling
+    slli.d  INCX, INCX, ZBASE_SHIFT // scale INCX the same way
+    srai.d I, N, 3 // I = N / 8 vector iterations
+    bne INCX, TEMP, .L20 // non-unit stride: strided path
+    bge $r0, I, .L13 // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L11: // unit-stride vector loop: adds the absolute values of 16 consecutive values (8 pairs) per pass
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vfmul.d VX2, neg1, VX0 // VX2 = -VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0 // mask of elements below zero
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // take the negated value where the mask is set: absolute value
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 4 * SIZE // same sequence with the roles of VX0/VX1 and VX2/VX3 swapped
+    vld VX3, X, 6 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 10 * SIZE
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vfmul.d VX0, neg1, VX2
+    vfmul.d VX1, neg1, VX3
+    vfcmp.clt.d VT0, VX2, res0
+    vfcmp.clt.d VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.d res2, VX2, VX3
+    vfadd.d res1, res1, res2
+    addi.d  I, I, -1
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vfmul.s VX2, neg1, VX0 // VX2 = -VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0 // mask of elements below zero
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // absolute value
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1 // partial sum of values 0-7
+    vld VX0, X, 8 * SIZE
+    vld VX1, X, 12 * SIZE
+    addi.d  I, I, -1
+    vfmul.s VX2, neg1, VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res3, VX1, VX0 // partial sum of values 8-15
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d X, X, 16 * SIZE // advance past the 16 values just consumed
+    blt $r0, I, .L11
+    .align 3
+
+.L12: // horizontal reduction: element 0 of res1 ends up holding the sum of all its elements
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1 // broadcast element 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1 // broadcast elements 1, 2 and 3 before res1 is modified
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L13: // unit-stride scalar tail: the N & 7 elements not covered by .L11
+    andi I, N, 7
+    bge $r0, I, .L999 // no remainder
+    .align 3
+
+.L14: // one pair of values per iteration
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1 // s1 ($f16) shares its register number with res1 ($vr16)
+    addi.d  X, X, 2 * SIZE
+    blt $r0, I, .L14
+    b .L999
+    .align 3
+
+.L20: // non-unit stride entry
+    bge $r0, I, .L23 // fewer than 8 elements: scalar loop only
+    .align 3
+
+.L21: // strided vector loop: gathers 8 pairs through integer registers, then sums absolute values
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE // raw 64-bit loads of one pair
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0 // VX2 = -VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0 // mask of elements below zero
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // absolute value
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE // pairs 2 and 3, same sequence
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t1, X, 0 * SIZE // pairs 4 and 5
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    vinsgr2vr.d VX1, t1, 0
+    vinsgr2vr.d VX1, t2, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+    ld.d t3, X, 0 * SIZE // pairs 6 and 7
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t3, 0
+    vinsgr2vr.d VX0, t4, 1
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    add.d X, X, INCX
+    vfmul.d VX2, neg1, VX0
+    vfmul.d VX3, neg1, VX1
+    vfcmp.clt.d VT0, VX0, res0
+    vfcmp.clt.d VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.d res2, VX0, VX1
+    vfadd.d res1, res1, res2
+#else
+    ld.w t1, X, 0 * SIZE // raw 32-bit loads, two pairs per vector register
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    vfmul.s VX2, neg1, VX0 // VX2 = -VX0
+    vfmul.s VX3, neg1, VX1
+    vfcmp.clt.s VT0, VX0, res0 // mask of elements below zero
+    vfcmp.clt.s VT1, VX1, res0
+    vbitsel.v VX0, VX0, VX2, VT0 // absolute value
+    vbitsel.v VX1, VX1, VX3, VT1
+    vfadd.s res2, VX0, VX1 // partial sum of pairs 0-3
+    ld.w t1, X, 0 * SIZE // pairs 4-7 are gathered into VX2 and VX3
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vfmul.s VX0, neg1, VX2
+    vfmul.s VX1, neg1, VX3
+    vfcmp.clt.s VT0, VX2, res0
+    vfcmp.clt.s VT1, VX3, res0
+    vbitsel.v VX2, VX2, VX0, VT0
+    vbitsel.v VX3, VX3, VX1, VT1
+    vfadd.s res3, VX2, VX3 // partial sum of pairs 4-7
+    vfadd.s res2, res3, res2
+    vfadd.s res1, res1, res2
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L21
+    .align 3
+
+.L22: // horizontal reduction after the strided loop: element 0 of res1 receives the total
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1 // broadcast element 1
+    vfadd.d res1, VX1, res1
+#else
+    vreplvei.w VX1, res1, 1 // broadcast elements 1, 2 and 3 before res1 is modified
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+#endif
+    .align 3
+
+.L23: // strided scalar tail: the N & 7 elements not covered by .L21
+    andi I, N, 7
+    bge $r0, I, .L999 // no remainder
+    .align 3
+
+.L24: // one pair of values per iteration
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    FABS a1, a1
+    FABS a2, a2
+    addi.d I, I, -1
+    ADD  a1, a1, a2
+    ADD  s1, a1, s1 // s1 ($f16) shares its register number with res1 ($vr16)
+    add.d  X, X, INCX // INCX was already scaled by ZBASE_SHIFT in the prologue
+    blt $r0, I, .L24
+    .align 3
+
+.L999: // common exit, also reached directly when N <= 0 or INCX <= 0
+    MOV  $f0, $f16 // result: s1, which is zero on the early-exit paths
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking (return)
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/caxpby_lasx.S
+++ b/kernel/loongarch64/caxpby_lasx.S
--- a/kernel/loongarch64/caxpby_lsx.S
+++ b/kernel/loongarch64/caxpby_lsx.S
--- a/kernel/loongarch64/caxpy_lasx.S
+++ b/kernel/loongarch64/caxpy_lasx.S
@ -0,0 +1,707 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+#define N      $r4 // element count
+#define XX     $r5 // not referenced in this file
+#define YY     $r6 // store cursor for the strided-Y paths
+#define ALPHAR $f0 // first scalar multiplier (used as the real part of alpha)
+#define ALPHAI $f1 // second scalar multiplier (used as the imaginary part of alpha)
+#define X      $r7 // current X address
+#define INCX   $r8 // X stride, shifted left by ZBASE_SHIFT in the prologue
+#define Y      $r9 // current Y address (load cursor)
+#define INCY   $r10 // Y stride, shifted left by ZBASE_SHIFT in the prologue
+
+#define I      $r12 // loop counter
+#define TEMP   $r13 // 1 << ZBASE_SHIFT, compared with the strides to detect unit stride
+#define t1     $r14 // t1-t4: integer scratch for strided loads
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+#define a1     $f12 // a1, a2: X pair in the scalar tail, a1 also holds 0.0 in the prologue
+#define a2     $f13
+#define a3     $f14 // a3, a4: Y pair in the scalar tail
+#define a4     $f15
+#define s1     $f16 // s1-s4: scalar temporaries and results in the tail loop
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+#define VX0    $xr8 // VX0-VX3: vector loads and temporaries
+#define VX1    $xr20
+#define VX2    $xr21
+#define VX3    $xr22
+#define VXAR   $xr23 // ALPHAR replicated to every element
+#define VXAI   $xr19 // ALPHAI replicated to every element
+#define x1     $xr18 // even-offset X values
+#define x2     $xr17 // odd-offset X values
+#define x3     $xr16 // even-offset Y values
+#define x4     $xr15 // odd-offset Y values
+
+    PROLOGUE
+
+    bge $r0, N, .L999 // N <= 0: nothing to do
+    li.d TEMP, 1
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1 // a1 = 0.0
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10 // ALPHAR does not compare equal to 0: do the update
+    bcnez $fcc1, .L999 // ALPHAR == 0 and ALPHAI == 0: Y is left untouched
+.L10:
+    slli.d  TEMP, TEMP, ZBASE_SHIFT // TEMP = unit stride after scaling
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    MTG  t1, ALPHAR // move the alpha bit patterns to integer registers for the broadcast
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    xvreplgr2vr.d VXAR, t1
+    xvreplgr2vr.d VXAI, t2
+    srai.d I, N, 2 // 4 elements per vector iteration
+#else
+    xvreplgr2vr.w VXAR, t1
+    xvreplgr2vr.w VXAI, t2
+    srai.d I, N, 3 // 8 elements per vector iteration
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11: // INCX==1 and INCY==1: both vectors contiguous
+    bge $r0, I, .L997 // no full vector iteration: scalar tail only
+    .align 3
+
+.L111: // Y += alpha * X (X conjugated when CONJ is defined), one pair of vector registers per pass
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0 // split into even-offset (x1, x3) and odd-offset (x2, x4) values
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+#else
+    xvld VX0, X, 0 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0 // split into even-offset (x1, x3) and odd-offset (x2, x4) values
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.d VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.s VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.s VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave x3/x4 before storing back to Y
+    xvilvh.d VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave x3/x4 before storing back to Y
+    xvilvh.w VX3, x4, x3
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997 // no full vector iteration: scalar tail only
+    move YY, Y // YY trails Y as the store cursor, Y itself is advanced by the loads
+    .align 3
+
+.L121: // contiguous X, strided Y gathered and scattered one pair at a time
+#ifdef DOUBLE
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // Y pairs 0,1,2,3 go to lanes 0,2,1,3; the stores below read them back in that order
+    xvinsgr2vr.d x4, t2, 0 // NOTE(review): presumably the lane order xvpickev.d/xvpickod.d give x1/x2 - confirm
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    add.d Y, Y, INCY
+    xvpickev.d x1, VX1, VX0 // even-offset X values
+    xvpickod.d x2, VX1, VX0 // odd-offset X values
+#else
+    xvld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // Y pairs 0..7 go to lanes 0,1,4,5,2,3,6,7; the stores below read them back in that order
+    xvinsgr2vr.w x4, t2, 0 // NOTE(review): presumably the lane order xvpickev.w/xvpickod.w give x1/x2 - confirm
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    add.d Y, Y, INCY
+    xvpickev.w x1, VX1, VX0 // even-offset X values
+    xvpickod.w x2, VX1, VX0 // odd-offset X values
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.d VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.s VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.s VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#endif
+#endif
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0 // scatter lanes 0,2,1,3 to consecutive Y positions
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    addi.d  I, I, -1
+#else
+    addi.d  I, I, -1
+    xvstelm.w x3, YY, 0 * SIZE, 0 // scatter lanes 0,1,4,5,2,3,6,7 to consecutive Y positions
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+    add.d YY, YY, INCY
+    addi.d X, X, 16 * SIZE
+#endif
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997 // no full vector iteration: scalar tail only
+    .align 3
+
+.L211: // strided X gathered one pair at a time, contiguous Y loaded and stored as whole vectors
+#ifdef DOUBLE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // X pairs 0,1,2,3 go to lanes 0,2,1,3
+    xvinsgr2vr.d x2, t2, 0 // NOTE(review): presumably the lane order xvpickev.d/xvpickod.d give x3/x4 - confirm
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    add.d X, X, INCX
+    xvpickev.d x3, VX3, VX2 // even-offset Y values
+    xvpickod.d x4, VX3, VX2 // odd-offset Y values
+#else
+    xvld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // X pairs 0..7 go to lanes 0,1,4,5,2,3,6,7
+    xvinsgr2vr.w x2, t2, 0 // NOTE(review): presumably the lane order xvpickev.w/xvpickod.w give x3/x4 - confirm
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    xvpickev.w x3, VX3, VX2 // even-offset Y values
+    xvpickod.w x4, VX3, VX2 // odd-offset Y values
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.d VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.s VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.s VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#endif
+#endif
+#ifdef DOUBLE
+    xvilvl.d VX2, x4 ,x3 // re-interleave x3/x4 before storing back to Y
+    xvilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 4 * SIZE
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvilvl.w VX2, x4 ,x3 // re-interleave x3/x4 before storing back to Y
+    xvilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    xvst VX2, Y, 0 * SIZE
+    xvst VX3, Y, 8 * SIZE
+    addi.d Y, Y, 16 * SIZE
+#endif
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+.L22: // INCX!=1 and INCY!=1: both vectors strided
+    bge $r0, I, .L997 // no full vector iteration: scalar tail only
+    move YY, Y // YY trails Y as the store cursor, Y itself is advanced by the loads
+    .align 3
+
+.L222: // gather X and Y pairs into lanes in natural order, update, scatter back through YY
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0 // X pair k goes to lane k
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0 // Y pair k goes to lane k
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0 // X pairs 0-3 go to lanes 0-3
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0 // Y pairs 0-3 go to lanes 0-3
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4 // X pairs 4-7 go to lanes 4-7
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4 // Y pairs 4-7 go to lanes 4-7
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+#endif
+#if !defined(CONJ)
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmsub.d VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.d VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfadd.d x4, x4, VX3
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmsub.s VX1, VXAR, x1, VX0 // VXAR*x1 - VXAI*x2
+    xvfmadd.s VX3, x2, VXAR, VX2 // VXAR*x2 + VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    xvfmul.d VX0, VXAI, x2
+    xvfmul.d VX2, VXAI, x1
+    xvfmadd.d VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.d VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.d x3, x3, VX1
+    xvfsub.d x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#else
+    xvfmul.s VX0, VXAI, x2
+    xvfmul.s VX2, VXAI, x1
+    xvfmadd.s VX1, VXAR, x1, VX0 // VXAR*x1 + VXAI*x2
+    xvfmsub.s VX3, x2, VXAR, VX2 // VXAR*x2 - VXAI*x1
+    xvfadd.s x3, x3, VX1
+    xvfsub.s x4, x4, VX3 // subtracted, so x4 gains VXAI*x1 - VXAR*x2
+#endif
+#endif
+    addi.d  I, I, -1
+#ifdef DOUBLE
+    xvstelm.d x3, YY, 0 * SIZE, 0 // scatter lanes 0-3 in order
+    xvstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 1
+    xvstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 2
+    xvstelm.d x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.d x3, YY, 0 * SIZE, 3
+    xvstelm.d x4, YY, 1 * SIZE, 3
+#else
+    xvstelm.w x3, YY, 0 * SIZE, 0 // scatter lanes 0-7 in order
+    xvstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 1
+    xvstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 2
+    xvstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 3
+    xvstelm.w x4, YY, 1 * SIZE, 3
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 4
+    xvstelm.w x4, YY, 1 * SIZE, 4
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 5
+    xvstelm.w x4, YY, 1 * SIZE, 5
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 6
+    xvstelm.w x4, YY, 1 * SIZE, 6
+    add.d YY, YY, INCY
+    xvstelm.w x3, YY, 0 * SIZE, 7
+    xvstelm.w x4, YY, 1 * SIZE, 7
+#endif
+    add.d YY, YY, INCY // step past the last pair stored, shared by both precisions
+    blt $r0, I, .L222
+    .align 3
+
+.L997: // scalar tail shared by all four stride paths
+#ifdef DOUBLE
+    andi I, N, 3 // remainder of the 4-element vector loops
+#else
+    andi I, N, 7 // remainder of the 8-element vector loops
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998: // one element per iteration, read and written through X and Y
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    LD  a3, Y, 0 * SIZE
+    LD  a4, Y, 1 * SIZE
+    addi.d I, I, -1
+#if !defined(CONJ)
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MSUB s3, ALPHAR, a1, s1 // assuming MSUB d, a, b, c yields a*b - c: ALPHAR*a1 - ALPHAI*a2
+    MADD s4, a2, ALPHAR, s2 // assuming MADD d, a, b, c yields a*b + c: ALPHAR*a2 + ALPHAI*a1
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#else
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+    MADD s3, ALPHAR, a1, s1 // under the same assumption: ALPHAR*a1 + ALPHAI*a2
+    MSUB s4, a2, ALPHAR, s2 // under the same assumption: ALPHAR*a2 - ALPHAI*a1
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4 // subtracted from the Y value, mirroring xvfsub in the vector loops
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d X, X, INCX // strides were already scaled by ZBASE_SHIFT in the prologue
+    add.d Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999: // common exit, also reached directly when N <= 0 or when both alpha parts are zero
+    move $r4, $r0 // return 0 on every path; $r12 (I) is never written before the early exits
+    jirl $r0, $r1, 0x0 // jump to the address in $r1 without linking (return)
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/caxpy_lsx.S
+++ b/kernel/loongarch64/caxpy_lsx.S
@ -0,0 +1,679 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+
+#include "common.h"
+
+// Register aliases for the LSX complex AXPY kernel.
+// N: element count (tested against zero, shifted right by 2 for the vector
+// loop count and masked with 3 for the scalar remainder).
+#define N      $r4
+// XX is not referenced anywhere in this kernel.
+#define XX     $r5
+// YY: store cursor used by the strided-Y loops (initialised from Y).
+#define YY     $r6
+// ALPHAR / ALPHAI: the two scalar factors used in the multiply-add below.
+#define ALPHAR $f0
+#define ALPHAI $f1
+// X / Y: load cursors; INCX / INCY: strides, shifted left by ZBASE_SHIFT in
+// the prologue before they are added to the cursors.
+#define X      $r7
+#define INCX   $r8
+#define Y      $r9
+#define INCY   $r10
+
+// I: loop counter. TEMP: holds 1 << ZBASE_SHIFT for the unit-stride test.
+#define I      $r12
+#define TEMP   $r13
+// t1..t4: integer scratch used to move strided elements into vector lanes.
+#define t1     $r14
+#define t2     $r16
+#define t3     $r15
+#define t4     $r17
+// a1..a4 / s1..s4: scalar FP scratch (prologue zero test and scalar tail).
+// NOTE(review): $fN is presumably the low part of $vrN on LoongArch, so
+// a4/s1/s2/s3/s4 would overlap x4/x3/x2/x1/VXAI; the scalar tail only runs
+// after the vector loops have finished - confirm against the ISA manual.
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16
+#define s2     $f17
+#define s3     $f18
+#define s4     $f19
+// VX0..VX3: vector scratch for raw loads, products and interleaved results.
+#define VX0    $vr8
+#define VX1    $vr20
+#define VX2    $vr21
+#define VX3    $vr22
+// VXAR / VXAI: ALPHAR / ALPHAI replicated into every lane.
+#define VXAR   $vr23
+#define VXAI   $vr19
+// x1 / x2: even / odd lanes gathered from X; x3 / x4: the same for Y.
+#define x1     $vr18
+#define x2     $vr17
+#define x3     $vr16
+#define x4     $vr15
+
+    // Entry: early-out checks, stride scaling, alpha broadcast, then dispatch
+    // to one of four loops depending on whether INCX / INCY are unit strides.
+    PROLOGUE
+
+    // Nothing to do when N <= 0.
+    bge $r0, N, .L999
+    li.d TEMP, 1
+    // Build a floating-point zero in a1 and compare ALPHAR and ALPHAI with it;
+    // leave early only when both compares succeed.
+    // NOTE(review): FFINT and CMPEQ are macros from common.h (not shown here);
+    // presumably an int-to-float convert and an FP equality compare - confirm.
+    movgr2fr.d a1, $r0
+    FFINT  a1, a1
+    CMPEQ  $fcc0, ALPHAR, a1
+    CMPEQ  $fcc1, ALPHAI, a1
+    bceqz $fcc0, .L10
+    bcnez $fcc1, .L999
+.L10:
+    // TEMP becomes 1 << ZBASE_SHIFT and both increments receive the same
+    // shift, so INCX == TEMP (or INCY == TEMP) below means the caller passed 1.
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    // Replicate ALPHAR / ALPHAI into every lane of VXAR / VXAI via t1 / t2.
+    MTG  t1, ALPHAR
+    MTG  t2, ALPHAI
+#ifdef DOUBLE
+    vreplgr2vr.d VXAR, t1
+    vreplgr2vr.d VXAI, t2
+#else
+    vreplgr2vr.w VXAR, t1
+    vreplgr2vr.w VXAI, t2
+#endif
+    // Vector loops consume four elements per iteration: I = N / 4.
+    srai.d I, N, 2
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+    // INCX==1 and INCY==1: X and Y are both read with full vector loads.
+    // Skip straight to the scalar remainder when there is no full group of 4.
+.L11:
+    bge $r0, I, .L997
+    .align 3
+
+.L111:
+    // Load two vectors from X and two from Y, then split them into
+    // even-lane (x1 / x3) and odd-lane (x2 / x4) vectors.
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX0, X, 0 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+    // Without CONJ: x3 += VXAR*x1 - VXAI*x2 ; x4 += VXAR*x2 + VXAI*x1
+    // With CONJ:    x3 += VXAR*x1 + VXAI*x2 ; x4 -= VXAR*x2 - VXAI*x1
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+    // Re-interleave x3 / x4 and store back to Y. The DOUBLE build only covers
+    // two elements per pass above, so it repeats the whole sequence for the
+    // data at offsets 4 and 6 before advancing the pointers.
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+    vld VX0, X, 4 * SIZE
+    vld VX2, Y, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    vld VX3, Y, 6 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    // Advance both cursors by 8 * SIZE and loop while I > 0.
+    addi.d X, X, 8 * SIZE
+    addi.d Y, Y, 8 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L997
+    .align 3
+
+    // X is read with vector loads; Y is gathered lane by lane through t1..t4
+    // and written back one lane at a time through the separate cursor YY.
+.L12: // INCX==1 and INCY!=1
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L121:
+    // Gather Y into x3 (offset 0 of each element) and x4 (offset 1), and
+    // split the contiguous X data into even lanes (x1) and odd lanes (x2).
+#ifdef DOUBLE
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#else
+    vld VX0, X, 0 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    add.d Y, Y, INCY
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+#endif
+    // Without CONJ: x3 += VXAR*x1 - VXAI*x2 ; x4 += VXAR*x2 + VXAI*x1
+    // With CONJ:    x3 += VXAR*x1 + VXAI*x2 ; x4 -= VXAR*x2 - VXAI*x1
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+    // Scatter the results through YY. The DOUBLE build handles two elements
+    // per pass, so it repeats gather / compute for X offsets 4 and 6 first.
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+
+    vld VX0, X, 4 * SIZE
+    vld VX1, X, 6 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    // Step past the last stored element, advance X, loop while I > 0.
+    add.d YY, YY, INCY
+    addi.d X, X, 8 * SIZE
+    blt $r0, I, .L121
+    b .L997
+    .align 3
+
+    // X is gathered lane by lane through t1..t4; Y is read and written with
+    // full vector loads and stores.
+.L21:// INCX!=1 and INCY==1
+    bge $r0, I, .L997
+    .align 3
+
+.L211:
+    // Gather X into x1 (offset 0 of each element) and x2 (offset 1), and
+    // split the contiguous Y data into even lanes (x3) and odd lanes (x4).
+#ifdef DOUBLE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#else
+    vld VX2, Y, 0 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    add.d X, X, INCX
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+#endif
+    // Without CONJ: x3 += VXAR*x1 - VXAI*x2 ; x4 += VXAR*x2 + VXAI*x1
+    // With CONJ:    x3 += VXAR*x1 + VXAI*x2 ; x4 -= VXAR*x2 - VXAI*x1
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+    // Re-interleave x3 / x4 and store to Y. The DOUBLE build handles two
+    // elements per pass, so it repeats the sequence for Y offsets 4 and 6.
+#ifdef DOUBLE
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 2 * SIZE
+
+    vld VX2, Y, 4 * SIZE
+    vld VX3, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    add.d X, X, INCX
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    vilvl.d VX2, x4 ,x3
+    vilvh.d VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+#else
+    vilvl.w VX2, x4 ,x3
+    vilvh.w VX3, x4, x3
+    addi.d  I, I, -1
+    vst VX2, Y, 0 * SIZE
+    vst VX3, Y, 4 * SIZE
+#endif
+    // X was already advanced by the gathers; advance Y and loop while I > 0.
+    addi.d Y, Y, 8 * SIZE
+    blt $r0, I, .L211
+    b .L997
+    .align 3
+
+    // INCX!=1 and INCY!=1: both X and Y are gathered lane by lane through
+    // t1..t4, and results are written through the separate cursor YY.
+    // This block falls through into .L997 when the loop ends.
+.L22:
+    bge $r0, I, .L997
+    move YY, Y
+    .align 3
+
+.L222:
+    // Gather X into x1 / x2 and Y into x3 / x4 (offset 0 / offset 1 of each
+    // element). The final add.d on Y is shared after the #endif.
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+#endif
+    add.d Y, Y, INCY
+    // Without CONJ: x3 += VXAR*x1 - VXAI*x2 ; x4 += VXAR*x2 + VXAI*x1
+    // With CONJ:    x3 += VXAR*x1 + VXAI*x2 ; x4 -= VXAR*x2 - VXAI*x1
+#if !defined(CONJ)
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmsub.s VX1, VXAR, x1, VX0
+    vfmadd.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfadd.s x4, x4, VX3
+#endif
+#else
+#ifdef DOUBLE
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#else
+    vfmul.s VX0, VXAI, x2
+    vfmul.s VX2, VXAI, x1
+    vfmadd.s VX1, VXAR, x1, VX0
+    vfmsub.s VX3, x2, VXAR, VX2
+    vfadd.s x3, x3, VX1
+    vfsub.s x4, x4, VX3
+#endif
+#endif
+    // Scatter the results through YY. The DOUBLE build handles two elements
+    // per pass, so it repeats gather / compute for the next two before the
+    // final stores.
+#ifdef DOUBLE
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    vinsgr2vr.d x3, t1, 0
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    add.d Y, Y, INCY
+#if !defined(CONJ)
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmsub.d VX1, VXAR, x1, VX0
+    vfmadd.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfadd.d x4, x4, VX3
+#else
+    vfmul.d VX0, VXAI, x2
+    vfmul.d VX2, VXAI, x1
+    vfmadd.d VX1, VXAR, x1, VX0
+    vfmsub.d VX3, x2, VXAR, VX2
+    vfadd.d x3, x3, VX1
+    vfsub.d x4, x4, VX3
+#endif
+    addi.d  I, I, -1
+    vstelm.d x3, YY, 0 * SIZE, 0
+    vstelm.d x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.d x3, YY, 0 * SIZE, 1
+    vstelm.d x4, YY, 1 * SIZE, 1
+#else
+    addi.d  I, I, -1
+    vstelm.w x3, YY, 0 * SIZE, 0
+    vstelm.w x4, YY, 1 * SIZE, 0
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 1
+    vstelm.w x4, YY, 1 * SIZE, 1
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 2
+    vstelm.w x4, YY, 1 * SIZE, 2
+    add.d YY, YY, INCY
+    vstelm.w x3, YY, 0 * SIZE, 3
+    vstelm.w x4, YY, 1 * SIZE, 3
+#endif
+    // Step past the last stored element and loop while I > 0.
+    add.d YY, YY, INCY
+    blt $r0, I, .L222
+    .align 3
+
+    // Scalar remainder: handle the N mod 4 elements the vector loops left.
+.L997:
+    andi I, N, 3
+    bge $r0, I, .L999
+    .align 3
+
+    // One element per iteration; a1/a2 come from X, a3/a4 from Y.
+.L998:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, Y, 0 * SIZE
+    LD  a4, Y, 1 * SIZE
+    // Both variants start from the same two ALPHAI products.
+    MUL  s1, ALPHAI, a2
+    MUL  s2, ALPHAI, a1
+#if defined(CONJ)
+    // s3 = a3 + (ALPHAR*a1 + ALPHAI*a2) ; s4 = a4 - (ALPHAR*a2 - ALPHAI*a1)
+    MADD s3, ALPHAR, a1, s1
+    MSUB s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    SUB  s4, a4, s4
+#else
+    // s3 = a3 + (ALPHAR*a1 - ALPHAI*a2) ; s4 = a4 + (ALPHAR*a2 + ALPHAI*a1)
+    MSUB s3, ALPHAR, a1, s1
+    MADD s4, a2, ALPHAR, s2
+    ADD  s3, s3, a3
+    ADD  s4, s4, a4
+#endif
+    ST  s3, Y, 0 * SIZE
+    ST  s4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L998
+    .align 3
+
+    // Common exit: return 0 in $r4 and jump back through $r1.
+    // The previous code returned $r12 (the loop counter I). On the early-exit
+    // paths (N <= 0, or both alpha compares equal to zero) control reaches
+    // here before I is ever written, so the returned value was whatever the
+    // caller had left in that register. Returning the zero register makes
+    // the result deterministic on every path.
+.L999:
+    move $r4, $r0
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/ccopy_lasx.S
+++ b/kernel/loongarch64/ccopy_lasx.S
@ -0,0 +1,386 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+// Register aliases for the LASX complex copy kernel.
+// N: element count (tested against zero, shifted right by 3 for the unrolled
+// loop count and masked with 7 for the remainder).
+#define N      $r4
+// X / Y: source and destination cursors; INCX / INCY: strides, shifted left
+// by ZBASE_SHIFT in the prologue before they are added to the cursors.
+#define X      $r5
+#define INCX   $r6
+#define Y      $r7
+#define INCY   $r8
+// I: loop counter. TEMP: holds 1 << ZBASE_SHIFT for the unit-stride test.
+#define I      $r17
+#define TEMP   $r18
+// t1..t4: integer scratch used to move strided elements into vector lanes.
+#define t1     $r14
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+// a1..a4: scalar FP scratch for the element-at-a-time loops.
+// NOTE(review): $f12..$f15 presumably overlap the low part of $xr12..$xr15
+// (VX0..VX3); the scalar loops and the vector loops never interleave here -
+// confirm against the ISA manual.
+#define a1     $f12
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+// VX0..VX3: 256-bit vector scratch for the bulk copy loops.
+#define VX0    $xr12
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+
+    // Entry: early-out for N <= 0, stride scaling, then dispatch to one of
+    // four loops depending on whether INCX / INCY are unit strides.
+    PROLOGUE
+    bge $r0, N, .L999
+    // TEMP becomes 1 << ZBASE_SHIFT and both increments receive the same
+    // shift, so INCX == TEMP (or INCY == TEMP) below means the caller passed 1.
+    // NOTE(review): ZBASE_SHIFT comes from common.h (not shown here).
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    // Unrolled loops move eight elements per iteration: I = N / 8.
+    srai.d I, N, 3
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+    // INCX==1 and INCY==1: bulk copy of 16 * SIZE bytes per iteration using
+    // whole-vector loads followed by whole-vector stores.
+.L11:
+    bge $r0, I, .L112
+    .align 3
+
+.L111:
+#ifdef DOUBLE
+    // Four vectors per iteration.
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    // Two vectors per iteration.
+    xvld VX0, X, 0 * SIZE
+    xvld VX1, X, 8 * SIZE
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    .align 3
+
+    // Remainder for the contiguous case: copy the last N mod 8 elements one
+    // at a time, two scalars per element.
+.L112:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d  X, X, 2 * SIZE
+    addi.d  Y, Y, 2 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+    // INCX==1 and INCY!=1: X is read with whole-vector loads; each pair of
+    // adjacent lanes is then stored to Y with per-lane stores, stepping Y by
+    // INCY after every pair.
+.L12:
+    bge $r0, I, .L122
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    // Four vectors, two lane pairs each: eight strided stores of two values.
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, X, 8 * SIZE
+    xvld VX3, X, 12 * SIZE
+    xvstelm.d VX0, Y, 0 * SIZE, 0
+    xvstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX0, Y, 0 * SIZE, 2
+    xvstelm.d VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 0
+    xvstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX1, Y, 0 * SIZE, 2
+    xvstelm.d VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 0
+    xvstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX2, Y, 0 * SIZE, 2
+    xvstelm.d VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 0
+    xvstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.d VX3, Y, 0 * SIZE, 2
+    xvstelm.d VX3, Y, 1 * SIZE, 3
+#else
+    // Two vectors, four lane pairs each: eight strided stores of two values.
+    xvld VX1, X, 8 * SIZE
+    xvstelm.w VX0, Y, 0 * SIZE, 0
+    xvstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 2
+    xvstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 4
+    xvstelm.w VX0, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX0, Y, 0 * SIZE, 6
+    xvstelm.w VX0, Y, 1 * SIZE, 7
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 0
+    xvstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 2
+    xvstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 4
+    xvstelm.w VX1, Y, 1 * SIZE, 5
+    add.d Y, Y, INCY
+    xvstelm.w VX1, Y, 0 * SIZE, 6
+    xvstelm.w VX1, Y, 1 * SIZE, 7
+#endif
+    // Shared final Y step for the eighth element, then advance X and loop.
+    add.d Y, Y, INCY
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+    // Remainder for contiguous X / strided Y: copy the last N mod 8 elements
+    // one at a time.
+.L122:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d  X, X, 2 * SIZE
+    add.d  Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+    // INCX!=1 and INCY==1: X is read two scalars at a time through t1..t4,
+    // stepping by INCX; the values are packed into vector lanes and written
+    // to Y with whole-vector stores.
+.L21:
+    bge $r0, I, .L212
+    .align 3
+
+.L211:
+#ifdef DOUBLE
+    // Two elements (four lanes) per vector, four vectors per iteration.
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX0, t1, 0
+    xvinsgr2vr.d VX0, t2, 1
+    xvinsgr2vr.d VX0, t3, 2
+    xvinsgr2vr.d VX0, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX1, t1, 0
+    xvinsgr2vr.d VX1, t2, 1
+    xvinsgr2vr.d VX1, t3, 2
+    xvinsgr2vr.d VX1, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX2, t1, 0
+    xvinsgr2vr.d VX2, t2, 1
+    xvinsgr2vr.d VX2, t3, 2
+    xvinsgr2vr.d VX2, t4, 3
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d VX3, t1, 0
+    xvinsgr2vr.d VX3, t2, 1
+    xvinsgr2vr.d VX3, t3, 2
+    xvinsgr2vr.d VX3, t4, 3
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 4 * SIZE
+    xvst VX2, Y, 8 * SIZE
+    xvst VX3, Y, 12 * SIZE
+#else
+    // Four elements (eight lanes) per vector, two vectors per iteration.
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 0
+    xvinsgr2vr.w VX0, t2, 1
+    xvinsgr2vr.w VX0, t3, 2
+    xvinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX0, t1, 4
+    xvinsgr2vr.w VX0, t2, 5
+    xvinsgr2vr.w VX0, t3, 6
+    xvinsgr2vr.w VX0, t4, 7
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 0
+    xvinsgr2vr.w VX1, t2, 1
+    xvinsgr2vr.w VX1, t3, 2
+    xvinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w VX1, t1, 4
+    xvinsgr2vr.w VX1, t2, 5
+    xvinsgr2vr.w VX1, t3, 6
+    xvinsgr2vr.w VX1, t4, 7
+    xvst VX0, Y, 0 * SIZE
+    xvst VX1, Y, 8 * SIZE
+#endif
+    // X was already advanced by the gathers; advance Y and loop while I > 0.
+    addi.d  I, I, -1
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L211
+    .align 3
+
+    // Remainder for strided X / contiguous Y: copy the last N mod 8 elements
+    // one at a time.
+.L212:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    addi.d I, I, -1
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+    // INCX!=1 and INCY!=1: purely scalar copy, eight elements per iteration.
+.L22:
+    bge $r0, I, .L223
+    .align 3
+
+.L222:
+    // The assembler repeats the two-element group below four times, which
+    // yields the same straight-line sequence as writing it out by hand.
+    .rept 4
+    // Load two elements from X ...
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    // ... then store them to Y.
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    .endr
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+    // Remainder for the fully strided case: copy the last N mod 8 elements
+    // one at a time, then fall through to the common exit.
+.L223:
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    addi.d I, I, -1
+    blt $r0, I, .L224
+    .align 3
+
+    // Common exit: return 0 in $r4 and jump back through $r1.
+    // The previous code returned $r12, a register this kernel never writes
+    // (its loop counter I lives in $r17), so the returned value was whatever
+    // the caller had left there. Returning the zero register makes the
+    // result deterministic on every path.
+.L999:
+    move $r4, $r0
+    jirl $r0, $r1, 0x0
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/ccopy_lsx.S
+++ b/kernel/loongarch64/ccopy_lsx.S
@ -0,0 +1,411 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; the routine exits immediately when N <= 0
+#define X      $r5   // source pointer, advanced as elements are read
+#define INCX   $r6   // source stride; shifted left by ZBASE_SHIFT on entry
+#define Y      $r7   // destination pointer, advanced as elements are written
+#define INCY   $r8   // destination stride; shifted left by ZBASE_SHIFT on entry
+#define I      $r17  // loop counter: N >> 3 for the unrolled loops, N & 7 for the tails
+#define TEMP   $r18  // 1 << ZBASE_SHIFT, the scaled value of a unit stride
+#define t1     $r14  // t1-t4: integer scratch used to gather strided source words
+#define t2     $r15
+#define t3     $r16
+#define t4     $r19
+#define a1     $f12  // a1-a4: scalar floating-point scratch for element-wise copies
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define VX0    $vr12 // VX0-VX3: LSX vector scratch (same register numbers as a1-a4)
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+
+    PROLOGUE
+    bge $r0, N, .L999  // nothing to copy when N <= 0
+    li.d TEMP, 1
+    slli.d  TEMP, TEMP, ZBASE_SHIFT  // TEMP = unit stride after scaling, for the comparisons below
+    slli.d  INCX, INCX, ZBASE_SHIFT  // scale both strides the same way as TEMP
+    slli.d  INCY, INCY, ZBASE_SHIFT
+    srai.d I, N, 3  // number of 8-element groups handled by the unrolled loops
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:// INCX==1 and INCY==1: both vectors contiguous, copy with whole-vector loads and stores
+    bge $r0, I, .L112  // N < 8: skip the unrolled loop and go to the scalar tail
+    .align 3
+
+.L111:  // each iteration copies 16 * SIZE values, i.e. 8 (real, imaginary) pairs
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    vst VX2, Y, 4 * SIZE
+    vst VX3, Y, 6 * SIZE
+    vld VX0, X, 8 * SIZE  // second half of the group reuses the same four registers
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    vst VX2, Y, 12 * SIZE
+    vst VX3, Y, 14 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    addi.d  I, I, -1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d X, X, 16 * SIZE
+    addi.d Y, Y, 16 * SIZE
+    blt $r0, I, .L111
+    .align 3
+
+.L112:  // scalar tail for the N & 7 elements left after the unrolled loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L113:  // copies one (real, imaginary) pair per iteration
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    addi.d Y, Y, 2 * SIZE
+    blt $r0, I, .L113
+    b .L999
+    .align 3
+
+.L12:   // INCX==1 and INCY!=1: vector loads from X, per-element scatter stores to Y
+    bge $r0, I, .L122  // N < 8: go to the scalar tail
+    .align 3
+
+.L121:  // each iteration copies 8 (real, imaginary) pairs; Y steps by INCY after every pair
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, X, 4 * SIZE
+    vld VX3, X, 6 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0  // lane 0 -> real slot, lane 1 -> imaginary slot
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vld VX0, X, 8 * SIZE  // second half of the group reuses the same four registers
+    vld VX1, X, 10 * SIZE
+    vld VX2, X, 12 * SIZE
+    vld VX3, X, 14 * SIZE
+    vstelm.d VX0, Y, 0 * SIZE, 0
+    vstelm.d VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX1, Y, 0 * SIZE, 0
+    vstelm.d VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX2, Y, 0 * SIZE, 0
+    vstelm.d VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.d VX3, Y, 0 * SIZE, 0
+    vstelm.d VX3, Y, 1 * SIZE, 1
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, X, 8 * SIZE
+    vld VX3, X, 12 * SIZE
+    vstelm.w VX0, Y, 0 * SIZE, 0  // each register holds two pairs: lanes 0-1, then lanes 2-3
+    vstelm.w VX0, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX0, Y, 0 * SIZE, 2
+    vstelm.w VX0, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 0
+    vstelm.w VX1, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX1, Y, 0 * SIZE, 2
+    vstelm.w VX1, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 0
+    vstelm.w VX2, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX2, Y, 0 * SIZE, 2
+    vstelm.w VX2, Y, 1 * SIZE, 3
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 0
+    vstelm.w VX3, Y, 1 * SIZE, 1
+    add.d Y, Y, INCY
+    vstelm.w VX3, Y, 0 * SIZE, 2
+    vstelm.w VX3, Y, 1 * SIZE, 3
+#endif
+    add.d Y, Y, INCY  // stride step for the eighth pair, shared by both precision paths
+    addi.d X, X, 16 * SIZE
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    .align 3
+
+.L122:  // scalar tail for the N & 7 elements left after the unrolled loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L123:  // one pair per iteration: contiguous read, strided write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    addi.d  X, X, 2 * SIZE
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  Y, Y, INCY
+    blt $r0, I, .L123
+    b .L999
+    .align 3
+
+.L21:  // INCX!=1 and INCY==1: gather strided X through integer registers, vector stores to Y
+    bge $r0, I, .L212  // N < 8: go to the scalar tail
+    .align 3
+
+.L211:  // each iteration copies 8 (real, imaginary) pairs; X steps by INCX after every pair
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE  // raw 64-bit loads: the values are only moved, never computed on
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0  // one pair per vector register
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 4 * SIZE
+    vst VX1, Y, 6 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 8 * SIZE
+    vst VX1, Y, 10 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d VX0, t1, 0
+    vinsgr2vr.d VX0, t2, 1
+    vinsgr2vr.d VX1, t3, 0
+    vinsgr2vr.d VX1, t4, 1
+    vst VX0, Y, 12 * SIZE
+    vst VX1, Y, 14 * SIZE
+#else
+    ld.w t1, X, 0 * SIZE  // raw 32-bit loads: the values are only moved, never computed on
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX0, t1, 0  // two pairs per vector register
+    vinsgr2vr.w VX0, t2, 1
+    vinsgr2vr.w VX0, t3, 2
+    vinsgr2vr.w VX0, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX1, t1, 0
+    vinsgr2vr.w VX1, t2, 1
+    vinsgr2vr.w VX1, t3, 2
+    vinsgr2vr.w VX1, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX2, t1, 0
+    vinsgr2vr.w VX2, t2, 1
+    vinsgr2vr.w VX2, t3, 2
+    vinsgr2vr.w VX2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w VX3, t1, 0
+    vinsgr2vr.w VX3, t2, 1
+    vinsgr2vr.w VX3, t3, 2
+    vinsgr2vr.w VX3, t4, 3
+    vst VX0, Y, 0 * SIZE
+    vst VX1, Y, 4 * SIZE
+    vst VX2, Y, 8 * SIZE
+    vst VX3, Y, 12 * SIZE
+#endif
+    addi.d Y, Y, 16 * SIZE  // 8 contiguous pairs were written to Y
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    .align 3
+
+.L212:  // scalar tail for the N & 7 elements left after the unrolled loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L213:  // one pair per iteration: strided read, contiguous write
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    addi.d  Y, Y, 2 * SIZE
+    blt $r0, I, .L213
+    b .L999
+    .align 3
+
+.L22:  // INCX!=1 and INCY!=1: both vectors strided, so everything is copied with scalar LD/ST
+    bge $r0, I, .L223  // N < 8: go to the scalar tail
+    .align 3
+
+.L222:  // unrolled by 8: four groups of two (real, imaginary) pairs
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    add.d X, X, INCX
+    LD  a3, X, 0 * SIZE
+    LD  a4, X, 1 * SIZE
+    add.d X, X, INCX
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ST  a3, Y, 0 * SIZE
+    ST  a4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    addi.d  I, I, -1
+    blt $r0, I, .L222
+    .align 3
+
+.L223:  // scalar tail for the N & 7 elements left after the unrolled loop
+    andi I, N, 7
+    bge $r0, I, .L999
+    .align 3
+
+.L224:  // one pair per iteration; falls through to .L999 when done
+    LD  a1, X, 0 * SIZE
+    LD  a2, X, 1 * SIZE
+    addi.d I, I, -1
+    ST  a1, Y, 0 * SIZE
+    ST  a2, Y, 1 * SIZE
+    add.d  X, X, INCX
+    add.d  Y, Y, INCY
+    blt $r0, I, .L224
+    .align 3
+
+.L999:  // common exit for every stride case and for N <= 0
+    move $r4, $r0  // return 0; the old source register $r12 is never written in this routine
+    jirl $r0, $r1, 0x0  // return to the caller
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/cdot_lasx.S
+++ b/kernel/loongarch64/cdot_lasx.S
@ -0,0 +1,565 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; the routine skips all loops when N <= 0
+#define X      $r5   // first operand pointer
+#define INCX   $r6   // first operand stride; shifted left by ZBASE_SHIFT on entry
+#define Y      $r7   // second operand pointer
+#define INCY   $r8   // second operand stride; shifted left by ZBASE_SHIFT on entry
+#define I      $r19  // loop counter
+#define TEMP   $r10  // 2 * SIZE, the scaled stride of a contiguous vector
+#define t1     $r11  // t1-t4: integer scratch used to gather strided words
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12  // a1-a4: scalar operands in the tail loop (same register numbers as VX0-VX3)
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16  // s1-s4: scalar sums; same register numbers as res1-res4, so they
+#define s2     $f17  // continue from lane 0 of the vector accumulators
+#define s3     $f18
+#define s4     $f19
+#define res1   $xr16 // accumulates x1 * x3 (real of X times real of Y)
+#define res2   $xr17 // accumulates x2 * x3 (imaginary of X times real of Y)
+#define res3   $xr18 // accumulates x1 * x4 (real of X times imaginary of Y)
+#define res4   $xr19 // accumulates x2 * x4 (imaginary of X times imaginary of Y)
+#define VX0    $xr12 // VX0-VX3: raw interleaved vector loads, also reduction scratch
+#define VX1    $xr13
+#define VX2    $xr14
+#define VX3    $xr15
+#define x1     $xr20 // real parts of X
+#define x2     $xr21 // imaginary parts of X
+#define x3     $xr22 // real parts of Y
+#define x4     $xr23 // imaginary parts of Y
+
+    PROLOGUE
+    xvxor.v res1, res1, res1  // zero all four accumulators before the N check, so .L999 sees zeros
+    xvxor.v res2, res2, res2
+    xvxor.v res3, res3, res3
+    xvxor.v res4, res4, res4
+    bge $r0, N, .L999  // N <= 0: result is built from the zeroed sums
+    li.d  TEMP, 2 * SIZE  // scaled stride of a contiguous vector, for the comparisons below
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 2  // vector loops handle 4 elements per iteration
+#else
+    srai.d I, N, 3  // vector loops handle 8 elements per iteration
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:  // INCX==1 and INCY==1: both operands loaded with full vector loads
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L111:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    xvld VX1, X, 4 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 4 * SIZE
+    xvpickev.d x1, VX1, VX0  // even slots = real parts, odd slots = imaginary parts
+    xvpickod.d x2, VX1, VX0
+    xvpickev.d x3, VX3, VX2  // same pick applied to Y, so lanes of X and Y stay paired
+    xvpickod.d x4, VX3, VX2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+    addi.d X, X, 8 * SIZE  // 4 complex elements consumed from each operand
+    addi.d Y, Y, 8 * SIZE
+#else
+    xvld VX1, X, 8 * SIZE
+    xvld VX2, Y, 0 * SIZE
+    xvld VX3, Y, 8 * SIZE
+    xvpickev.w x1, VX1, VX0  // even slots = real parts, odd slots = imaginary parts
+    xvpickod.w x2, VX1, VX0
+    xvpickev.w x3, VX3, VX2  // same pick applied to Y, so lanes of X and Y stay paired
+    xvpickod.w x4, VX3, VX2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+    addi.d X, X, 16 * SIZE  // 8 complex elements consumed from each operand
+    addi.d Y, Y, 16 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L12:  // INCX==1 and INCY!=1: vector loads from X, Y gathered through integer registers
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L121:
+    xvld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0  // Y elements 0 and 1 go to lanes 0 and 2; elements 2 and 3 to lanes
+    xvinsgr2vr.d x4, t2, 0  // 1 and 3, matching the lane order the X pick below produces
+    xvinsgr2vr.d x3, t3, 2
+    xvinsgr2vr.d x4, t4, 2
+    xvld VX1, X, 4 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 1
+    xvinsgr2vr.d x4, t2, 1
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    addi.d X, X, 8 * SIZE  // 4 complex elements consumed from X
+    xvpickev.d x1, VX1, VX0
+    xvpickod.d x2, VX1, VX0
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0  // Y elements 0-3 go to lanes 0, 1, 4, 5 and elements 4-7 to lanes
+    xvinsgr2vr.w x4, t2, 0  // 2, 3, 6, 7, matching the lane order the X pick below produces
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    xvld VX1, X, 8 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    addi.d X, X, 16 * SIZE  // 8 complex elements consumed from X
+    xvpickev.w x1, VX1, VX0
+    xvpickod.w x2, VX1, VX0
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L21:  // INCX!=1 and INCY==1: X gathered through integer registers, vector loads from Y
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L211:
+    xvld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0  // X elements 0 and 1 go to lanes 0 and 2; elements 2 and 3 to lanes
+    xvinsgr2vr.d x2, t2, 0  // 1 and 3, matching the lane order the Y pick below produces
+    xvinsgr2vr.d x1, t3, 2
+    xvinsgr2vr.d x2, t4, 2
+    xvld VX3, Y, 4 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 1
+    xvinsgr2vr.d x2, t2, 1
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    addi.d Y, Y, 8 * SIZE  // 4 complex elements consumed from Y
+    xvpickev.d x3, VX3, VX2
+    xvpickod.d x4, VX3, VX2
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0  // X elements 0-3 go to lanes 0, 1, 4, 5 and elements 4-7 to lanes
+    xvinsgr2vr.w x2, t2, 0  // 2, 3, 6, 7, matching the lane order the Y pick below produces
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    xvld VX3, Y, 8 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    addi.d Y, Y, 16 * SIZE  // 8 complex elements consumed from Y (VX2 at 0, VX3 at 8 * SIZE)
+    xvpickev.w x3, VX3, VX2
+    xvpickod.w x4, VX3, VX2
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L22:  // INCX!=1 and INCY!=1: both operands gathered through integer registers
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L222:  // X and Y elements are inserted at the same lane index, so the lanes stay paired
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 0  // real words go to x1, imaginary words to x2
+    xvinsgr2vr.d x2, t2, 0
+    xvinsgr2vr.d x1, t3, 1
+    xvinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 0  // real words go to x3, imaginary words to x4
+    xvinsgr2vr.d x4, t2, 0
+    xvinsgr2vr.d x3, t3, 1
+    xvinsgr2vr.d x4, t4, 1
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.d x1, t1, 2
+    xvinsgr2vr.d x2, t2, 2
+    xvinsgr2vr.d x1, t3, 3
+    xvinsgr2vr.d x2, t4, 3
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.d x3, t1, 2
+    xvinsgr2vr.d x4, t2, 2
+    xvinsgr2vr.d x3, t3, 3
+    xvinsgr2vr.d x4, t4, 3
+    xvfmadd.d res1, x1, x3, res1
+    xvfmadd.d res2, x2, x3, res2
+    xvfmadd.d res3, x1, x4, res3
+    xvfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 0  // real words go to x1, imaginary words to x2
+    xvinsgr2vr.w x2, t2, 0
+    xvinsgr2vr.w x1, t3, 1
+    xvinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 0  // real words go to x3, imaginary words to x4
+    xvinsgr2vr.w x4, t2, 0
+    xvinsgr2vr.w x3, t3, 1
+    xvinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 2
+    xvinsgr2vr.w x2, t2, 2
+    xvinsgr2vr.w x1, t3, 3
+    xvinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 2
+    xvinsgr2vr.w x4, t2, 2
+    xvinsgr2vr.w x3, t3, 3
+    xvinsgr2vr.w x4, t4, 3
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 4
+    xvinsgr2vr.w x2, t2, 4
+    xvinsgr2vr.w x1, t3, 5
+    xvinsgr2vr.w x2, t4, 5
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 4
+    xvinsgr2vr.w x4, t2, 4
+    xvinsgr2vr.w x3, t3, 5
+    xvinsgr2vr.w x4, t4, 5
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    xvinsgr2vr.w x1, t1, 6
+    xvinsgr2vr.w x2, t2, 6
+    xvinsgr2vr.w x1, t3, 7
+    xvinsgr2vr.w x2, t4, 7
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    xvinsgr2vr.w x3, t1, 6
+    xvinsgr2vr.w x4, t2, 6
+    xvinsgr2vr.w x3, t3, 7
+    xvinsgr2vr.w x4, t4, 7
+    xvfmadd.s res1, x1, x3, res1
+    xvfmadd.s res2, x2, x3, res2
+    xvfmadd.s res3, x1, x4, res3
+    xvfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222  // on exit, falls through to the reduction at .L996
+    .align 3
+
+.L996:  // horizontal reduction: fold every lane of res1-res4 into lane 0, which s1-s4 then read
+#ifdef DOUBLE
+    xvpickve.d VX1, res1, 1  // bring lanes 1-3 down to lane 0 of scratch registers
+    xvpickve.d VX2, res1, 2
+    xvpickve.d VX3, res1, 3
+    xvfadd.d res1, VX1, res1  // only lane 0 of each sum is used afterwards
+    xvfadd.d res1, VX2, res1
+    xvfadd.d res1, VX3, res1
+    xvpickve.d VX1, res2, 1
+    xvpickve.d VX2, res2, 2
+    xvpickve.d VX3, res2, 3
+    xvfadd.d res2, VX1, res2
+    xvfadd.d res2, VX2, res2
+    xvfadd.d res2, VX3, res2
+    xvpickve.d VX1, res3, 1
+    xvpickve.d VX2, res3, 2
+    xvpickve.d VX3, res3, 3
+    xvfadd.d res3, VX1, res3
+    xvfadd.d res3, VX2, res3
+    xvfadd.d res3, VX3, res3
+    xvpickve.d VX1, res4, 1
+    xvpickve.d VX2, res4, 2
+    xvpickve.d VX3, res4, 3
+    xvfadd.d res4, VX1, res4
+    xvfadd.d res4, VX2, res4
+    xvfadd.d res4, VX3, res4
+#else
+    xvpickve.w VX0, res1, 1  // bring lanes 1-7 down to lane 0; x1-x3 are free to use as scratch here
+    xvpickve.w VX1, res1, 2
+    xvpickve.w VX2, res1, 3
+    xvpickve.w VX3, res1, 4
+    xvpickve.w x1, res1, 5
+    xvpickve.w x2, res1, 6
+    xvpickve.w x3, res1, 7
+    xvfadd.s res1, VX0, res1  // only lane 0 of each sum is used afterwards
+    xvfadd.s res1, VX1, res1
+    xvfadd.s res1, VX2, res1
+    xvfadd.s res1, VX3, res1
+    xvfadd.s res1, x1, res1
+    xvfadd.s res1, x2, res1
+    xvfadd.s res1, x3, res1
+    xvpickve.w VX0, res2, 1
+    xvpickve.w VX1, res2, 2
+    xvpickve.w VX2, res2, 3
+    xvpickve.w VX3, res2, 4
+    xvpickve.w x1, res2, 5
+    xvpickve.w x2, res2, 6
+    xvpickve.w x3, res2, 7
+    xvfadd.s res2, VX0, res2
+    xvfadd.s res2, VX1, res2
+    xvfadd.s res2, VX2, res2
+    xvfadd.s res2, VX3, res2
+    xvfadd.s res2, x1, res2
+    xvfadd.s res2, x2, res2
+    xvfadd.s res2, x3, res2
+    xvpickve.w VX0, res3, 1
+    xvpickve.w VX1, res3, 2
+    xvpickve.w VX2, res3, 3
+    xvpickve.w VX3, res3, 4
+    xvpickve.w x1, res3, 5
+    xvpickve.w x2, res3, 6
+    xvpickve.w x3, res3, 7
+    xvfadd.s res3, VX0, res3
+    xvfadd.s res3, VX1, res3
+    xvfadd.s res3, VX2, res3
+    xvfadd.s res3, VX3, res3
+    xvfadd.s res3, x1, res3
+    xvfadd.s res3, x2, res3
+    xvfadd.s res3, x3, res3
+    xvpickve.w VX0, res4, 1
+    xvpickve.w VX1, res4, 2
+    xvpickve.w VX2, res4, 3
+    xvpickve.w VX3, res4, 4
+    xvpickve.w x1, res4, 5
+    xvpickve.w x2, res4, 6
+    xvpickve.w x3, res4, 7
+    xvfadd.s res4, VX0, res4
+    xvfadd.s res4, VX1, res4
+    xvfadd.s res4, VX2, res4
+    xvfadd.s res4, VX3, res4
+    xvfadd.s res4, x1, res4
+    xvfadd.s res4, x2, res4
+    xvfadd.s res4, x3, res4
+#endif
+    .align 3
+
+.L997:  // scalar tail: elements left over after the vector loops (or all of them if none ran)
+#ifdef DOUBLE
+    andi I, N, 3
+#else
+    andi I, N, 7
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:  // one complex element per iteration, accumulated into the same four partial sums
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1  // real(X) * real(Y)
+    MADD s2, a2, a3, s2  // imag(X) * real(Y)
+    MADD s3, a1, a4, s3  // real(X) * imag(Y)
+    MADD s4, a2, a4, s4  // imag(X) * imag(Y)
+    addi.d I, I, -1
+    add.d  X, X, INCX  // scaled strides work for every dispatch case, including unit stride
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:  // combine the four partial sums into the complex result ($f0 = real, $f1 = imaginary)
+#ifndef CONJ
+    SUB $f0, s1, s4  // real = re*re - im*im
+    ADD $f1, s3, s2  // imag = re*im + im*re
+#else
+    ADD $f0, s1, s4  // CONJ build: the terms involving imag(X) change sign
+    SUB $f1, s3, s2
+#endif
+    jirl    $r0, $r1, 0x0  // return to the caller
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/cdot_lsx.S
+++ b/kernel/loongarch64/cdot_lsx.S
@ -0,0 +1,397 @@
+/***************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+#define ASSEMBLER
+#include "common.h"
+
+#define N      $r4   // element count; the routine skips all loops when N <= 0
+#define X      $r5   // first operand pointer
+#define INCX   $r6   // first operand stride; shifted left by ZBASE_SHIFT on entry
+#define Y      $r7   // second operand pointer
+#define INCY   $r8   // second operand stride; shifted left by ZBASE_SHIFT on entry
+#define I      $r19  // loop counter
+#define TEMP   $r10  // 2 * SIZE, the scaled stride of a contiguous vector
+#define t1     $r11  // t1-t4: integer scratch used to gather strided words
+#define t2     $r12
+#define t3     $r13
+#define t4     $r14
+#define a1     $f12  // a1-a4: scalar operands in the tail loop (same register numbers as VX0-VX3)
+#define a2     $f13
+#define a3     $f14
+#define a4     $f15
+#define s1     $f16  // s1-s4: scalar sums; same register numbers as res1-res4, so they
+#define s2     $f17  // continue from lane 0 of the vector accumulators
+#define s3     $f18
+#define s4     $f19
+#define res1   $vr16 // accumulates x1 * x3 (real of X times real of Y)
+#define res2   $vr17 // accumulates x2 * x3 (imaginary of X times real of Y)
+#define res3   $vr18 // accumulates x1 * x4 (real of X times imaginary of Y)
+#define res4   $vr19 // accumulates x2 * x4 (imaginary of X times imaginary of Y)
+#define VX0    $vr12 // VX0-VX3: raw interleaved vector loads, also reduction scratch
+#define VX1    $vr13
+#define VX2    $vr14
+#define VX3    $vr15
+#define x1     $vr20 // real parts of X
+#define x2     $vr21 // imaginary parts of X
+#define x3     $vr22 // real parts of Y
+#define x4     $vr23 // imaginary parts of Y
+
+    PROLOGUE
+    vxor.v res1, res1, res1  // zero all four accumulators before the N check, so .L999 sees zeros
+    vxor.v res2, res2, res2
+    vxor.v res3, res3, res3
+    vxor.v res4, res4, res4
+    bge $r0, N, .L999  // N <= 0: result is built from the zeroed sums
+    li.d  TEMP, 2 * SIZE  // scaled stride of a contiguous vector, for the comparisons below
+    slli.d  INCX, INCX, ZBASE_SHIFT
+    slli.d  INCY, INCY, ZBASE_SHIFT
+#ifdef DOUBLE
+    srai.d I, N, 1  // vector loops handle 2 elements per iteration
+#else
+    srai.d I, N, 2  // vector loops handle 4 elements per iteration
+#endif
+    bne INCX, TEMP, .L20
+    bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
+    b .L11  // INCX==1 and INCY==1
+.L20:
+    bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
+    b .L21 // INCX!=1 and INCY==1
+
+.L11:  // INCX==1 and INCY==1: both operands loaded with full vector loads
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L111:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 2 * SIZE
+    vpickev.d x1, VX1, VX0  // even slots = real parts, odd slots = imaginary parts
+    vpickod.d x2, VX1, VX0
+    vpickev.d x3, VX3, VX2  // same pick applied to Y, so lanes of X and Y stay paired
+    vpickod.d x4, VX3, VX2
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+    addi.d X, X, 4 * SIZE  // 2 complex elements consumed from each operand
+    addi.d Y, Y, 4 * SIZE
+#else
+    vld VX1, X, 4 * SIZE
+    vld VX2, Y, 0 * SIZE
+    vld VX3, Y, 4 * SIZE
+    vpickev.w x1, VX1, VX0  // even slots = real parts, odd slots = imaginary parts
+    vpickod.w x2, VX1, VX0
+    vpickev.w x3, VX3, VX2  // same pick applied to Y, so lanes of X and Y stay paired
+    vpickod.w x4, VX3, VX2
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+    addi.d X, X, 8 * SIZE  // 4 complex elements consumed from each operand
+    addi.d Y, Y, 8 * SIZE
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L111
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L12:  // INCX==1 and INCY!=1: vector loads from X, Y gathered through integer registers
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L121:
+    vld VX0, X, 0 * SIZE
+#ifdef DOUBLE
+    vld VX1, X, 2 * SIZE
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0  // real words go to x3, imaginary words to x4, in element order
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    addi.d X, X, 4 * SIZE  // 2 complex elements consumed from X
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0  // real words go to x3, imaginary words to x4, in element order
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    vld VX1, X, 4 * SIZE
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    addi.d X, X, 8 * SIZE  // 4 complex elements consumed from X
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L121
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L21:  // INCX!=1 and INCY==1: X gathered through integer registers, vector loads from Y
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L211:
+    vld VX2, Y, 0 * SIZE
+#ifdef DOUBLE
+    vld VX3, Y, 2 * SIZE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0  // real words go to x1, imaginary words to x2, in element order
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    addi.d Y, Y, 4 * SIZE  // 2 complex elements consumed from Y
+    vpickev.d x3, VX3, VX2
+    vpickod.d x4, VX3, VX2
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0  // real words go to x1, imaginary words to x2, in element order
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    vld VX3, Y, 4 * SIZE
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    addi.d Y, Y, 8 * SIZE  // 4 complex elements consumed from Y (VX2 at 0, VX3 at 4 * SIZE)
+    vpickev.w x3, VX3, VX2
+    vpickod.w x4, VX3, VX2
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L211
+    b .L996  // reduce the vector accumulators, then run the scalar tail
+    .align 3
+
+.L22:  // INCX!=1 and INCY!=1: both operands gathered through integer registers
+    bge $r0, I, .L997  // too few elements for one vector iteration: scalar tail only
+    .align 3
+
+.L222:  // X and Y elements are inserted at the same lane index, so the lanes stay paired
+#ifdef DOUBLE
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.d t3, X, 0 * SIZE
+    ld.d t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.d x1, t1, 0  // real words go to x1, imaginary words to x2
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    ld.d t1, Y, 0 * SIZE
+    ld.d t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.d t3, Y, 0 * SIZE
+    ld.d t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.d x3, t1, 0  // real words go to x3, imaginary words to x4
+    vinsgr2vr.d x4, t2, 0
+    vinsgr2vr.d x3, t3, 1
+    vinsgr2vr.d x4, t4, 1
+    vfmadd.d res1, x1, x3, res1
+    vfmadd.d res2, x2, x3, res2
+    vfmadd.d res3, x1, x4, res3
+    vfmadd.d res4, x2, x4, res4
+#else
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 0  // real words go to x1, imaginary words to x2
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 0  // real words go to x3, imaginary words to x4
+    vinsgr2vr.w x4, t2, 0
+    vinsgr2vr.w x3, t3, 1
+    vinsgr2vr.w x4, t4, 1
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d X, X, INCX
+    ld.w t3, X, 0 * SIZE
+    ld.w t4, X, 1 * SIZE
+    add.d X, X, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    ld.w t1, Y, 0 * SIZE
+    ld.w t2, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    ld.w t3, Y, 0 * SIZE
+    ld.w t4, Y, 1 * SIZE
+    add.d Y, Y, INCY
+    vinsgr2vr.w x3, t1, 2
+    vinsgr2vr.w x4, t2, 2
+    vinsgr2vr.w x3, t3, 3
+    vinsgr2vr.w x4, t4, 3
+    vfmadd.s res1, x1, x3, res1
+    vfmadd.s res2, x2, x3, res2
+    vfmadd.s res3, x1, x4, res3
+    vfmadd.s res4, x2, x4, res4
+#endif
+    addi.d  I, I, -1
+    blt $r0, I, .L222  // on exit, falls through to the reduction at .L996
+    .align 3
+
+.L996:  // horizontal reduction: fold every lane of res1-res4 into lane 0, which s1-s4 then read
+#ifdef DOUBLE
+    vreplvei.d VX1, res1, 1  // broadcast lane 1, then add it onto the accumulator
+    vfadd.d res1, VX1, res1  // only lane 0 of each sum is used afterwards
+    vreplvei.d VX1, res2, 1
+    vfadd.d res2, VX1, res2
+    vreplvei.d VX1, res3, 1
+    vfadd.d res3, VX1, res3
+    vreplvei.d VX1, res4, 1
+    vfadd.d res4, VX1, res4
+#else
+    vreplvei.w VX1, res1, 1  // broadcast lanes 1-3, then add them onto the accumulator
+    vreplvei.w VX2, res1, 2
+    vreplvei.w VX3, res1, 3
+    vfadd.s res1, VX1, res1  // only lane 0 of each sum is used afterwards
+    vfadd.s res1, VX2, res1
+    vfadd.s res1, VX3, res1
+    vreplvei.w VX1, res2, 1
+    vreplvei.w VX2, res2, 2
+    vreplvei.w VX3, res2, 3
+    vfadd.s res2, VX1, res2
+    vfadd.s res2, VX2, res2
+    vfadd.s res2, VX3, res2
+    vreplvei.w VX1, res3, 1
+    vreplvei.w VX2, res3, 2
+    vreplvei.w VX3, res3, 3
+    vfadd.s res3, VX1, res3
+    vfadd.s res3, VX2, res3
+    vfadd.s res3, VX3, res3
+    vreplvei.w VX1, res4, 1
+    vreplvei.w VX2, res4, 2
+    vreplvei.w VX3, res4, 3
+    vfadd.s res4, VX1, res4
+    vfadd.s res4, VX2, res4
+    vfadd.s res4, VX3, res4
+#endif
+    .align 3
+
+.L997:  // scalar tail: elements left over after the vector loops (or all of them if none ran)
+#ifdef DOUBLE
+    andi I, N, 1
+#else
+    andi I, N, 3
+#endif
+    bge $r0, I, .L999
+    .align 3
+
+.L998:  // one complex element per iteration, accumulated into the same four partial sums
+    LD   a1, X, 0 * SIZE
+    LD   a2, X, 1 * SIZE
+    LD   a3, Y, 0 * SIZE
+    LD   a4, Y, 1 * SIZE
+    MADD s1, a1, a3, s1  // real(X) * real(Y)
+    MADD s2, a2, a3, s2  // imag(X) * real(Y)
+    MADD s3, a1, a4, s3  // real(X) * imag(Y)
+    MADD s4, a2, a4, s4  // imag(X) * imag(Y)
+    addi.d I, I, -1
+    add.d  X, X, INCX  // scaled strides work for every dispatch case, including unit stride
+    add.d  Y, Y, INCY
+    blt $r0, I, .L998
+    .align 3
+
+.L999:  // combine the four partial sums into the complex result ($f0 = real, $f1 = imaginary)
+#ifndef CONJ
+    SUB $f0, s1, s4  // real = re*re - im*im
+    ADD $f1, s3, s2  // imag = re*im + im*re
+#else
+    ADD $f0, s1, s4  // CONJ build: the terms involving imag(X) change sign
+    SUB $f1, s3, s2
+#endif
+    jirl    $r0, $r1, 0x0  // return to the caller
+    .align 3
+
+    EPILOGUE
--- a/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lasx.S
@ -0,0 +1,857 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm, extent of the i loops
+#define N      $r5   // param 2: bn, extent of the j loops
+#define K      $r6   // param 3: bk, length of the k loops
+#define ALPHA_R $f0   // param 4: alphar, real part of alpha
+#define ALPHA_I $f1   // param 5: alphai, imaginary part of alpha
+#define A      $r7   // param 6: ba, base of the A panel
+#define B      $r8  // param 7: bb, base of the B panel (advanced per column block)
+#define C      $r9  // param 8: bc, base of the output (advanced per column block)
+#define LDC    $r10  // param 9: ldc, multiplied by 4 in the prologue
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset, TRMM builds only
+#endif
+#define OFF    $r26  // running TRMM offset; zeroed when not (TRMMKERNEL && !LEFT)
+
+#define I      $r12  // i loop counter; also scratch for the C stride at .L18/.L28
+#define J      $r13  // j loop counter
+#define L      $r14  // k loop counter; also scratch for the B stride at .L18/.L28
+#define TL     $r15  // k trip count for the current tile; scratch for the column stride at .L10
+#define A0     $r16  // moving pointer into A (ptrba)
+#define B0     $r17  // moving pointer into B (ptrbb)
+#define C0     $r18  // output pointer, first column of the block
+#define C1     $r19  // output pointer, second column of the block
+#define C2     $r20  // used only as scratch: bound of the unrolled / remainder k loops
+#define C3     $r23  // used only as scratch: byte offsets in the TRMM pointer fix-ups
+#define T0     $r24  // loop bound for the j loops and most i loops
+#define T1     $r25  // loop bound (M & 1) for the .L25 loop
+
+#define a1     $f2  // scalar temporaries a1..a8: A loads and C read-modify-write values
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10  // scalar temporaries b1..b8: B loads and C read-modify-write values
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18  // scalar accumulator pair c11/c12 (stored at offsets 0x00/0x04 of C0)
+#define c12    $f19
+#define c21    $f20  // second scalar accumulator pair c21/c22
+#define c22    $f21
+#define c31    $f22  // c31..c42 are not referenced by the code in this file
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LASX vectors */
+#define U0     $xr30  // vector accumulator (res 0 2 4 6); low half also addressed as $vr30
+#define U1     $xr31  // vector accumulator (res 1 3 5 7); low half also addressed as $vr31
+#define U2     $xr2  // U2..U15 are not referenced by the code in this file
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define U8     $xr8
+#define U9     $xr9
+#define U10    $xr10
+#define U11    $xr11
+#define U12    $xr12
+#define U13    $xr13
+#define U14    $xr14
+#define U15    $xr15
+#define D0     $xr16  // D0..D3: raw A/B loads in the unrolled loop .L12
+#define D1     $xr17
+#define D2     $xr18
+#define D3     $xr19
+#define D4     $xr20  // D4..D7: shuffled A operands
+#define D5     $xr21
+#define D6     $xr22
+#define D7     $xr23
+#define D8     $xr24  // D8..D11: shuffled B operands
+#define D9     $xr25
+#define D10    $xr26
+#define D11    $xr27
+#define VALPHAR    $xr28  // alpha real part replicated to every 32-bit lane; also used as $vr28
+#define VALPHAI    $xr29  // alpha imaginary part replicated to every 32-bit lane; also used as $vr29
+
+
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)  // sign pattern of a * b
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    XVFADD1       XVFADD  // XVFADD1..4 are defined for every variant but not referenced in this file
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFADD
+
+#define    MADD1       MADD  // MADD1: first-component acc updated with a[0] * b[0] (see .L16)
+#define    MADD2       MADD  // MADD2: second-component acc updated with a[1] * b[0]
+#define    MADD3       NMSUB  // MADD3: first-component acc updated with a[1] * b[1]
+#define    MADD4       MADD  // MADD4: second-component acc updated with a[0] * b[1]
+#endif
+
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)  // sign pattern of a * conj(b)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVFMADD
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFADD
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)  // sign pattern of conj(a) * b
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVFMADD
+#define    XVMADD4       XVFMADD
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFADD
+#define    XVFADD4       XVFADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)  // sign pattern of conj(a) * conj(b)
+#define    XVMADD1       XVFMADD
+#define    XVMADD2       XVNMSUB
+#define    XVMADD3       XVNMSUB
+#define    XVMADD4       XVNMSUB
+
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    XVFADD1       XVFADD
+#define    XVFADD2       XVFSUB
+#define    XVFADD3       XVFSUB
+#define    XVFADD4       XVFSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128  // reserve a 128-byte stack frame
+    SDARG      $r23,   $sp,   0  // save $r23..$r27 ($r23..$r26 are C3, T0, T1, OFF)
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32  // saved and restored although this file never uses $r27
+    ST         $f23,   $sp,   40  // save $f23..$f31
+    ST         $f24,   $sp,   48
+    ST         $f25,   $sp,   56
+    ST         $f26,   $sp,   64
+    ST         $f27,   $sp,   72
+    ST         $f28,   $sp,   80
+    ST         $f29,   $sp,   88
+    ST         $f30,   $sp,   96
+    ST         $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112  // spill alpha to memory so it can be broadcast below
+    ST         ALPHA_I,$sp,   120
+
+    xvldrepl.w  VALPHAR, $sp, 112  // VALPHAR = alpha_r in every 32-bit lane
+    xvldrepl.w  VALPHAI, $sp, 120  // VALPHAI = alpha_i in every 32-bit lane
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET  // OFF = -OFFSET
+#else
+    xor        OFF,    OFF,   OFF  // OFF = 0
+#endif
+
+    slli.d     LDC,    LDC,   2  // LDC *= 4; later shifts build column strides from this value
+
+    move       J,      $r0  // j = 0
+    srai.d     T0,     N,     1  // T0 = bn / 2, number of two-column blocks
+    beq        J,      T0,    .L19  // no two-column block: go to the bn & 1 tail
+
+.L10:  /* for(j=0; j<bn/2; j+=1): one block of two output columns */
+    move       C0,     C  // C0 = first column of this block
+    slli.d     TL,     LDC,   1  // TL = 2 * LDC (LDC already times 4): byte stride of one column
+    add.d      C1,     C0,    TL  // C1 = second column of this block
+    move       A0,     A    //ptrba: restart at the beginning of the A panel
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // LEFT: reset the running offset for every column block
+#endif
+
+    move       I,      $r0  // i = 0
+    srai.d     T0,     M,     1  // T0 = bm / 2, number of two-row tiles
+    beq        I,      T0,    .L150  // no two-row tile: go to the bm & 1 row
+
+.L11:  /* for(i=0; i<bm/2; i+=1): set up one 2x2 tile */
+    move       B0,     B      //ptrbb: restart at the beginning of the B panel
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    slli.d     C3,     OFF,   0x04  // C3 = OFF * 16 bytes (16 bytes per k step in each panel here)
+    add.d      A0,     A0,    C3  // skip the first OFF k steps of A
+    add.d      B0,     B,     C3  // skip the first OFF k steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //k steps to run = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // k steps to run = OFF + 2
+#else
+    addi.d     TL,     OFF,   2  // k steps to run = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    xvxor.v    U0,     U0,   U0  // clear accumulator U0 (x ^ x = 0)
+    xvxor.v    U1,     U1,   U1  // clear accumulator U1
+
+    move       L,      $r0   //cycle param k: k = 0
+    srai.d     C2,     TL,     2  // C2 = TL / 4, iterations of the 4x unrolled loop
+    beq        L,      C2,    .L130  // C2 == 0: skip the unrolled loop
+    blt        C2,     L,     .L130  // C2 < 0: skip the unrolled loop
+
+.L12:  /* for(k=0; k<bk/4; k+=1): four k steps (0x40 bytes of A and of B) per iteration */
+    xvld       D0,     A0,    0x00  //a 0-7
+    xvld       D1,     A0,    0x20  //a 8-15
+    xvld       D2,     B0,    0x00  //b 0-7
+    xvld       D3,     B0,    0x20  //b 8-15
+
+    xvand.v    D4,     D0,    D0  // D4 = copy of D0 (x & x = x)
+    xvpermi.q  D4,     D1,    0x02  //a 0 1 2 3 8 9 10 11
+    xvand.v    D5,     D4,    D4  // D5 = copy of D4
+    xvshuf4i.w D4,     D4,    0x88  //a 0 2 0 2 8 10 8 10
+    xvshuf4i.w D5,     D5,    0xdd  //a 1 3 1 3 9 11 9 11
+
+    xvand.v    D6,     D1,    D1  // D6 = copy of D1
+    xvpermi.q  D6,     D0,    0x31  //a 4 5 6 7 12 13 14 15
+    xvand.v    D7,     D6,    D6  // D7 = copy of D6
+    xvshuf4i.w D6,     D6,    0x88  //a 4 6 4 6 12 14 12 14
+    xvshuf4i.w D7,     D7,    0xdd  //a 5 7 5 7 13 15 13 15
+
+    xvand.v    D8,     D2,    D2  // D8 = copy of D2
+    xvpermi.q  D8,     D3,    0x02  //b 0 1 2 3 8 9 10 11
+    xvand.v    D9,     D8,    D8  // D9 = copy of D8
+    xvshuf4i.w D8,     D8,    0xa0  //b 0 0 2 2 8 8 10 10
+    xvshuf4i.w D9,     D9,    0xf5  //b 1 1 3 3 9 9 11 11
+
+    xvand.v    D10,     D3,    D3  // D10 = copy of D3
+    xvpermi.q  D10,     D2,    0x31  //b 4 5 6 7 12 13 14 15
+    xvand.v    D11,     D10,    D10  // D11 = copy of D10
+    xvshuf4i.w D10,     D10,    0xa0  //b 4 4 6 6 12 12 14 14
+    xvshuf4i.w D11,     D11,    0xf5  //b 5 5 7 7 13 13 15 15
+
+    XVMADD1      U0,     D4,    D8,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D5,    D8,     U1  //res1 3 5 7 1 3 5 7
+
+    xvpermi.q    U0,     U0,    0x01  // swap the 128-bit halves of U0
+    xvpermi.q    U1,     U1,    0x01  // swap the 128-bit halves of U1
+    XVMADD1      U0,     D4,    D8,     U0  // repeat on swapped halves: each half now holds both k-step products
+    XVMADD2      U1,     D5,    D8,     U1
+
+    XVMADD3      U0,     D5,    D9,     U0  // a-odd * b-odd term into U0
+    XVMADD4      U1,     D4,    D9,     U1  // a-even * b-odd term into U1
+
+    xvpermi.q    U0,     U0,    0x01  // swap halves, then repeat as above
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D5,    D9,     U0
+    XVMADD4      U1,     D4,    D9,     U1
+
+    XVMADD1      U0,     D6,    D10,     U0  //res0 2 4 6 0 2 4 6
+    XVMADD2      U1,     D7,    D10,     U1  //res1 3 5 7 1 3 5 7
+
+    xvpermi.q    U0,     U0,    0x01  // same swap-and-repeat scheme for the a 4-7 / 12-15 group
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD1      U0,     D6,    D10,     U0
+    XVMADD2      U1,     D7,    D10,     U1
+
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    xvpermi.q    U0,     U0,    0x01
+    xvpermi.q    U1,     U1,    0x01
+    XVMADD3      U0,     D7,    D11,     U0
+    XVMADD4      U1,     D6,    D11,     U1
+
+    addi.d     A0,     A0,    0x40  // advance A by 64 bytes
+    addi.d     B0,     B0,    0x40  // advance B by 64 bytes
+
+    addi.d     L,      L,     1  // k++
+    blt        L,      C2,    .L12  // loop while k < TL / 4
+
+.L130:  // remainder of the k loop for the 2x2 tile
+    move       L,      $r0  // k = 0
+    andi       C2,     TL,     3  // C2 = TL & 3, leftover k steps
+    beq        L,      C2,    .L14  // none left: go to the write-back
+
+.L13:  /* for(k=0; k<(bk&3); k+=1): one k step on the low 128 bits ($vrN aliases $xrN) */
+    vld       $vr16,     A0,    0x00  //a0 a1 a2 a3
+    vld       $vr17,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w $vr20,     $vr17,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w $vr21,     $vr17,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  $vr18,     $vr16,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  $vr19,     $vr16,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      $vr30,     $vr18,    $vr20,     $vr30  //res0 2 4 6 ($vr30 = low half of U0)
+    VMADD2      $vr31,     $vr19,    $vr20,     $vr31  //res1 3 5 7 ($vr31 = low half of U1)
+    VMADD3      $vr30,     $vr19,    $vr21,     $vr30
+    VMADD4      $vr31,     $vr18,    $vr21,     $vr31
+
+    addi.d     A0,     A0,    0x10  // advance A by 16 bytes (one k step)
+    addi.d     B0,     B0,    0x10  // advance B by 16 bytes (one k step)
+
+    addi.d     L,      L,     1  // k++
+    blt        L,      C2,    .L13  // loop while k < (TL & 3)
+
+.L14:  // write back the 2x2 tile: $vr30 holds res 0 2 4 6, $vr31 holds res 1 3 5 7
+#if defined(TRMMKERNEL)
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+
+    vfmul.s      $vr10,    $vr30,    $vr28  // overwrites the gathered C values: $vr10 = res_even * alpha_r
+    vfmul.s      $vr11,    $vr31,    $vr28  // $vr11 = res_odd * alpha_r
+    VNMSUB    $vr10,    $vr31,    $vr29, $vr10  // $vr10 -= res_odd * alpha_i
+    VFMADD     $vr11,    $vr30,    $vr29, $vr11  // $vr11 += res_even * alpha_i
+
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00  // store the first column
+    vst       $vr9,     C1,    0x00  // store the second column
+#else
+    vld       $vr8,     C0,    0x00  //0 1 2 3
+    vld       $vr9,     C1,    0x00  //4 5 6 7
+
+    vpackev.w $vr10,    $vr9,    $vr8    //0 4 2 6
+    vpermi.w  $vr10,    $vr10,   0xd8    //0 2 4 6
+
+    vpackod.w $vr11,    $vr9,    $vr8    //1 5 3 7
+    vpermi.w  $vr11,    $vr11,   0xd8    //1 3 5 7
+
+    VFMADD      $vr10,    $vr30,    $vr28, $vr10  // C_even += res_even * alpha_r
+    VFMADD      $vr11,    $vr31,    $vr28, $vr11  // C_odd  += res_odd  * alpha_r
+    VNMSUB     $vr10,    $vr31,    $vr29, $vr10  // C_even -= res_odd  * alpha_i
+    VFMADD      $vr11,    $vr30,    $vr29, $vr11  // C_odd  += res_even * alpha_i
+
+    vilvl.w   $vr8,     $vr11,   $vr10  //0 1 2 3
+
+    vilvh.w   $vr9,     $vr11,   $vr10  //4 5 6 7
+
+    vst       $vr8,     C0,    0x00  // store the first column
+    vst       $vr9,     C1,    0x00  // store the second column
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2  // TL = bk - OFF - 2
+#else
+    addi.d     TL,     TL,   -2  // TL = bk - OFF - 2
+#endif
+    slli.d     C3,     TL,   0x04  // C3 = TL * 16 bytes
+    add.d      A0,     A0,   C3  // step A0 over the k steps that were not executed
+    add.d      B0,     B0,   C3  // step B0 over the k steps that were not executed
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // LEFT: running offset grows by the tile height
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10  // next tile: 16 bytes further in the first column
+    addi.d     C1,     C1,    0x10  // and in the second column
+
+    addi.d     I,      I,     1  // i++
+    blt        I,      T0,    .L11  // loop while i < bm / 2
+
+.L150:  // leftover row (bm & 1) of the two-column block
+    move       I,      $r0  // i = 0
+    andi       T0,     M,     1  // T0 = bm & 1
+    beq        I,      T0,    .L18  // bm is even: this column block is done
+
+.L15:  /* for(i=0; i<(bm&1); i+=1): set up one 1x2 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x03  // A advances 8 bytes per k step in this tile
+    add.d      A0,     A0,   C3  // skip the first OFF k steps of A
+    slli.d     C3,     OFF,  0x04  // B advances 16 bytes per k step in this tile
+    add.d      B0,     B,    C3  // skip the first OFF k steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // k steps to run = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // k steps to run = OFF + 1
+#else
+    addi.d     TL,     OFF,   2  // k steps to run = OFF + 2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0  // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k: k = 0
+    beq        L,      TL,    .L17  // TL == 0: skip the k loop
+    blt        TL,     L,     .L17  // TL < 0: skip the k loop
+
+.L16:  /* for (k=0; k<bk; k+=1): 1x2 tile, one A pair against two B pairs */
+    LD         a1,     A0,    0x00        //a[0] (interleaved layout, presumably the real part)
+    LD         b1,     B0,    0x00        //first B pair, b[0]
+    MADD1      c11,    a1,    b1,     c11  //res0: a[0] * b[0]
+    LD         a2,     A0,    0x04        //a[1] (presumably the imaginary part)
+    MADD2      c12,    a2,    b1,     c12  //res1: a[1] * b[0]
+    LD         b2,     B0,    0x04        //first B pair, b[1]
+    MADD3      c11,    a2,    b2,     c11  // res0: a[1] * b[1]
+    MADD4      c12,    a1,    b2,     c12  // res1: a[0] * b[1]
+    LD         b3,     B0,    0x08        //second B pair, b[2]
+    MADD1      c21,    a1,    b3,     c21  //res2: a[0] * b[2]
+    MADD2      c22,    a2,    b3,     c22  //res3: a[1] * b[2]
+    LD         b4,     B0,    0x0c        //second B pair, b[3]
+    MADD3      c21,    a2,    b4,     c21  // res2: a[1] * b[3]
+    MADD4      c22,    a1,    b4,     c22  // res3: a[0] * b[3]
+
+    addi.d     A0,     A0,    0x08  // advance A by 8 bytes (one pair)
+    addi.d     B0,     B0,    0x10  // advance B by 16 bytes (two pairs)
+
+    addi.d     L,      L,     1  // k++
+    blt        L,      TL,     .L16  // loop while k < TL
+
+.L17:  // write back the 1x2 tile: (c11,c12) to C0, (c21,c22) to C1
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6  // c11 * alpha_r - c12 * alpha_i
+    ST         a5,     C0,    0x00  // TRMM: stored without reading the old C value
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6  // c12 * alpha_r + c11 * alpha_i
+    ST         a6,     C0,    0x04
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6  // c21 * alpha_r - c22 * alpha_i
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6  // c22 * alpha_r + c21 * alpha_i
+    ST         b6,     C1,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // C0[0] += c11 * alpha_r
+    MADD       a6,     c12,   ALPHA_R, a6  // C0[1] += c12 * alpha_r
+    NMSUB      a5,     c12,   ALPHA_I, a5  // C0[0] -= c12 * alpha_i
+    MADD       a6,     c11,   ALPHA_I, a6  // C0[1] += c11 * alpha_i
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5  // C1[0] += c21 * alpha_r
+    MADD       b6,     c22,   ALPHA_R, b6  // C1[1] += c22 * alpha_r
+    NMSUB      b5,     c22,   ALPHA_I, b5  // C1[0] -= c22 * alpha_i
+    MADD       b6,     c21,   ALPHA_I, b6  // C1[1] += c21 * alpha_i
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1  // LEFT: TL = bk - OFF - 1
+#else
+    addi.d     TL,     TL,   -2  // otherwise: TL = bk - OFF - 2
+#endif
+    slli.d     C3,     TL,   0x03  // 8 bytes of A per skipped k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04  // 16 bytes of B per skipped k step
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // LEFT: running offset grows by the tile height
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08  // next row: 8 bytes further in the first column
+    addi.d     C1,     C1,    0x08  // and in the second column
+
+    addi.d     I,      I,     1  // i++
+    blt        I,      T0,    .L15  // loop while i < (bm & 1)
+
+.L18:  // end of one two-column block: advance B and C, then loop on j
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2  // !LEFT: running offset grows by the two columns just done
+#endif
+
+    slli.d     L,      K,     0x04  // L = bk * 16 bytes
+    add.d      B,      B,     L  // B moves past the panel of this column block
+
+    slli.d     I,      LDC,   0x02  // I = 4 * LDC (LDC already times 4): two column strides
+    add.d      C,      C,     I  // C moves two columns forward
+
+    addi.d     J,      J,     1  // j++
+    srai.d     T0,     N,     1  // recompute T0 = bn / 2 (T0 was reused by the i loops)
+    blt        J,      T0,    .L10  // loop while j < bn / 2
+
+.L19:  // leftover column (bn & 1)
+    move       J,      $r0  // j = 0
+    andi       T0,     N,     1  // T0 = bn & 1
+    beq        J,      T0,    .L30  // bn is even: restore registers and return
+
+.L20: /* for (j=0; j<(bn&1); j+=1): the single leftover column */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET  // LEFT: reset the running offset for this column
+#endif
+
+    move       C0,     C  // only one output column, so C1 is not set up here
+    move       A0,     A    //ptrba: restart at the beginning of the A panel
+
+    move       I,      $r0  // i = 0
+    srai.d     T0,     M,     1  // T0 = bm / 2, number of two-row tiles
+    beq        I,      T0,    .L24  // no two-row tile: go to the bm & 1 row
+
+.L21:  /* for (i=0; i<bm/2; i+=1): set up one 2x1 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,  0x04  // A advances 16 bytes per k step in this tile
+    add.d      A0,     A0,   C3  // skip the first OFF k steps of A
+    slli.d     C3,     OFF,  0x03  // B advances 8 bytes per k step in this tile
+    add.d      B0,     B,    C3  // skip the first OFF k steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // k steps to run = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2  // k steps to run = OFF + 2
+#else
+    addi.d     TL,     OFF,   1  // k steps to run = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0  // clear the four scalar accumulators
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k: k = 0
+    beq        L,      TL,    .L23  // TL == 0: skip the k loop
+    blt        TL,     L,     .L23  // TL < 0: skip the k loop
+
+.L22:  /* for (k=0; k<bk; k+=1): 2x1 tile, two A pairs against one B pair */
+    LD         a1,     A0,    0x00        //first A pair, a[0]
+    LD         b1,     B0,    0x00        //b[0]
+    MADD1      c11,    a1,    b1,     c11  //res0: a[0] * b[0]
+    LD         a2,     A0,    0x04        //first A pair, a[1]
+    MADD2      c12,    a2,    b1,     c12  //res1: a[1] * b[0]
+    LD         b2,     B0,    0x04        //b[1]
+    MADD3      c11,    a2,    b2,     c11  // res0: a[1] * b[1]
+    MADD4      c12,    a1,    b2,     c12  // res1: a[0] * b[1]
+    LD         a3,     A0,    0x08        //second A pair, a[2]
+    MADD1      c21,    a3,    b1,     c21  //res2: a[2] * b[0]
+    LD         a4,     A0,    0x0c        //second A pair, a[3]
+    MADD2      c22,    a4,    b1,     c22  //res3: a[3] * b[0]
+    MADD3      c21,    a4,    b2,     c21  // res2: a[3] * b[1]
+    MADD4      c22,    a3,    b2,     c22  // res3: a[2] * b[1]
+
+    addi.d     A0,     A0,    0x10  // advance A by 16 bytes (two pairs)
+    addi.d     B0,     B0,    0x08  // advance B by 8 bytes (one pair)
+
+    addi.d     L,      L,     1  // k++
+    blt        L,      TL,    .L22  // loop while k < TL
+
+.L23:  // write back the 2x1 tile: (c11,c12) and (c21,c22) go to consecutive slots of C0
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6  // c11 * alpha_r - c12 * alpha_i
+    ST         a5,     C0,    0x00  // TRMM: stored without reading the old C value
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6  // c12 * alpha_r + c11 * alpha_i
+    ST         a6,     C0,    0x04
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8  // c21 * alpha_r - c22 * alpha_i
+    ST         a7,     C0,    0x08
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8  // c22 * alpha_r + c21 * alpha_i
+    ST         a8,     C0,    0x0c
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // C0[0] += c11 * alpha_r
+    MADD       a6,     c12,   ALPHA_R, a6  // C0[1] += c12 * alpha_r
+    NMSUB      a5,     c12,   ALPHA_I, a5  // C0[0] -= c12 * alpha_i
+    MADD       a6,     c11,   ALPHA_I, a6  // C0[1] += c11 * alpha_i
+    MADD       a7,     c21,   ALPHA_R, a7  // C0[2] += c21 * alpha_r
+    MADD       a8,     c22,   ALPHA_R, a8  // C0[3] += c22 * alpha_r
+    NMSUB      a7,     c22,   ALPHA_I, a7  // C0[2] -= c22 * alpha_i
+    MADD       a8,     c21,   ALPHA_I, a8  // C0[3] += c21 * alpha_i
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2  // LEFT: TL = bk - OFF - 2
+#else
+    addi.d     TL,     TL,   -1  // otherwise: TL = bk - OFF - 1
+#endif
+    slli.d     C3,     TL,   0x04  // 16 bytes of A per skipped k step
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03  // 8 bytes of B per skipped k step
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2  // LEFT: running offset grows by the tile height
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10  // next tile: 16 bytes further down the column
+
+    addi.d     I,      I,     1  // i++
+    blt        I,      T0,    .L21  // loop while i < bm / 2
+
+.L24:  // leftover row (bm & 1) of the leftover column
+    move       I,      $r0  // i = 0
+    andi       T1,     M,     1    //bm&1 (this loop keeps its bound in T1, not T0)
+    beq        I,      T1,    .L28  // bm is even: this column is done
+
+.L25:  /* for (i=0; i<(bm&1); i+=1): set up the 1x1 tile */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    slli.d     C3,     OFF,   0x03  // both panels advance 8 bytes per k step in this tile
+    add.d      A0,     A0,   C3  // skip the first OFF k steps of A
+    add.d      B0,     B,    C3  // skip the first OFF k steps of B
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF  // k steps to run = bk - OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1  // k steps to run = OFF + 1
+#else
+    addi.d     TL,     OFF,   1  // k steps to run = OFF + 1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0  // clear the accumulator pair
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k: k = 0
+    beq        L,      TL,    .L27  // TL == 0: skip the k loop
+    blt        TL,     L,     .L27  // TL < 0: skip the k loop
+
+.L26:  /* for (k=0; k<bk; k+=1): 1x1 tile, one A pair against one B pair */
+    LD         a1,     A0,    0x00        //a[0]
+    LD         b1,     B0,    0x00        //b[0]
+    MADD1      c11,    a1,    b1,     c11  //res0: a[0] * b[0]
+    LD         a2,     A0,    0x04        //a[1]
+    MADD2      c12,    a2,    b1,     c12  //res1: a[1] * b[0]
+    LD         b2,     B0,    0x04        //b[1]
+    MADD3      c11,    a2,    b2,     c11  // res0: a[1] * b[1]
+    MADD4      c12,    a1,    b2,     c12  // res1: a[0] * b[1]
+
+    addi.d     A0,     A0,    0x08  // advance A by 8 bytes (one pair)
+    addi.d     B0,     B0,    0x08  // advance B by 8 bytes (one pair)
+
+    addi.d     L,      L,     1  // k++
+    blt        L,      TL,    .L26  // loop while k < TL
+
+.L27:  // write back the 1x1 tile: (c11,c12) to C0
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6  // c11 * alpha_r - c12 * alpha_i
+    ST         a5,     C0,    0x00  // TRMM: stored without reading the old C value
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6  // c12 * alpha_r + c11 * alpha_i
+    ST         a6,     C0,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5  // C0[0] += c11 * alpha_r
+    MADD       a6,     c12,   ALPHA_R, a6  // C0[1] += c12 * alpha_r
+    NMSUB      a5,     c12,   ALPHA_I, a5  // C0[0] -= c12 * alpha_i
+    MADD       a6,     c11,   ALPHA_I, a6  // C0[1] += c11 * alpha_i
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF  // TL = bk - OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1  // TL = bk - OFF - 1
+#else
+    addi.d     TL,     TL,   -1  // TL = bk - OFF - 1
+#endif
+    slli.d     C3,     TL,   0x03  // 8 bytes per skipped k step in both panels
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1  // LEFT: running offset grows by the tile height
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08  // next row: 8 bytes further down the column
+
+    addi.d     I,      I,     1  // i++
+    blt        I,      T1,    .L25  // loop while i < (bm & 1)
+
+.L28:  // end of the leftover column: advance B and C, then loop on j
+    slli.d     L,      K,     3  // L = bk * 8 bytes
+    add.d      B,      B,     L  // B moves past the panel of this column
+
+    slli.d     I,      LDC,   1  // I = 2 * LDC (LDC already times 4): one column stride
+    add.d      C,      C,     I  // C moves one column forward
+
+    addi.d     J,      J,     1  // j++
+    andi       T0,     N,     1  // recompute T0 = bn & 1 (T0 was reused by the i loop)
+    blt        J,      T0,    .L20  // loop while j < (bn & 1)
+
+.L30:  // restore the registers saved in the prologue and return
+    LDARG      $r23,   $sp,   0  // reload $r23..$r27 from the same slots the prologue used
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    LD         $f23,   $sp,   40  // reload $f23..$f31
+    LD         $f24,   $sp,   48
+    LD         $f25,   $sp,   56
+    LD         $f26,   $sp,   64
+    LD         $f27,   $sp,   72
+    LD         $f28,   $sp,   80
+    LD         $f29,   $sp,   88
+    LD         $f30,   $sp,   96
+    LD         $f31,   $sp,   104  // slots 112 and 120 held the alpha spill and are not reloaded
+
+    addi.d     $sp,    $sp,   128  // release the 128-byte stack frame
+    jirl       $r0,    $r1,   0x0  // return: jump to the address in $r1, link register discarded
+
+    EPILOGUE
--- a/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
+++ b/kernel/loongarch64/cgemm_kernel_2x2_lsx.S
@ -0,0 +1,812 @@
+/*******************************************************************************
+Copyright (c) 2023, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+
+/* Function parameters */
+#define M      $r4   // param 1: bm
+#define N      $r5   // param 2: bn
+#define K      $r6   // param 3: bk
+#define ALPHA_R $f0   // param 4: alphar
+#define ALPHA_I $f1   // param 5: alphai
+#define A      $r7   // param 6: ba
+#define B      $r8  // param 7: bb
+#define C      $r9  // param 8: bc
+#define LDC    $r10  // param 9: ldc
+
+#if defined (TRMMKERNEL)
+#define OFFSET $r11  // param 10: offset
+#endif
+/* Running triangular offset; initialised from OFFSET (TRMM) or zeroed. */
+#define OFF    $r26
+
+/* Loop counters (I: row tiles, J: column tiles, L: k steps, TL: k length) */
+#define I      $r12
+#define J      $r13
+#define L      $r14
+#define TL     $r15
+/* Working pointers into the packed A / B panels and the C columns */
+#define A0     $r16
+#define B0     $r17
+#define C0     $r18
+#define C1     $r19
+/* Scratch integer registers ($r23-$r25 are spilled in the prologue) */
+#define C2     $r20
+#define C3     $r23
+#define T0     $r24
+#define T1     $r25
+
+/* Scalar floating-point registers used by the edge (non-vector) loops.
+   Not every alias below is referenced by the code in this file. */
+#define a1     $f2
+#define a2     $f3
+#define a3     $f4
+#define a4     $f5
+#define a5     $f6
+#define a6     $f7
+#define a7     $f8
+#define a8     $f9
+#define b1     $f10
+#define b2     $f11
+#define b3     $f12
+#define b4     $f13
+#define b5     $f14
+#define b6     $f15
+#define b7     $f16
+#define b8     $f17
+#define c11    $f18
+#define c12    $f19
+#define c21    $f20
+#define c22    $f21
+#define c31    $f22
+#define c32    $f23
+#define c41    $f24
+#define c42    $f25
+
+/* LSX (128-bit) vectors.
+   NOTE(review): $vrN overlays $fN architecturally, so U0/U1 and
+   VALPHAR/VALPHAI share storage with $f28-$f31 - confirm against the
+   LoongArch reference manual when touching the prologue/epilogue. */
+#define U0     $vr30
+#define U1     $vr31
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define U8     $vr8
+#define U9     $vr9
+#define U10    $vr10
+#define U11    $vr11
+#define U12    $vr12
+#define U13    $vr13
+#define U14    $vr14
+#define U15    $vr15
+#define D0     $vr16
+#define D1     $vr17
+#define D2     $vr18
+#define D3     $vr19
+#define D4     $vr20
+#define D5     $vr21
+#define D6     $vr22
+#define D7     $vr23
+#define D8     $vr24
+#define D9     $vr25
+#define D10    $vr26
+#define D11    $vr27
+/* alpha real / imaginary part replicated into every 32-bit lane */
+#define VALPHAR    $vr28
+#define VALPHAI    $vr29
+
+
+/*
+ * Sign selection for the four partial products of a complex multiply.
+ * The loops below use the slots as
+ *     MADD1  acc0 += x0 * y0        MADD3  acc0 (+/-)= x1 * y1
+ *     MADD2  acc1 (+/-)= x1 * y0    MADD4  acc1 (+/-)= x0 * y1
+ * where x0/x1 are the words at A0+0 / A0+4 and y0/y1 the words at
+ * B0+0 / B0+4 (VMADDn are the vector counterparts of MADDn).
+ * NOTE(review): assuming interleaved (re, im) storage and that
+ * NMSUB / VNMSUB (defined outside this file) compute acc - x*y, the four
+ * groups presumably correspond to a*b, a*conj(b), conj(a)*b and
+ * conj(a)*conj(b) respectively - confirm against common.h.
+ */
+#if   defined(NN) || defined(NT) || defined(TN) || defined(TT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VNMSUB
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       NMSUB
+#define    MADD4       MADD
+#endif
+
+/* Second operand (B side) flagged R or C */
+#if   defined(NR) || defined(NC) || defined(TR) || defined(TC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VFMADD
+#define    VMADD3       VFMADD
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       MADD
+#define    MADD3       MADD
+#define    MADD4       NMSUB
+#endif
+
+/* First operand (A side) flagged R or C */
+#if   defined(RN) || defined(RT) || defined(CN) || defined(CT)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VFMADD
+#define    VMADD4       VFMADD
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       MADD
+#define    MADD4       MADD
+#endif
+
+/* Both operands flagged R or C */
+#if   defined(RR) || defined(RC) || defined(CR) || defined(CC)
+#define    VMADD1       VFMADD
+#define    VMADD2       VNMSUB
+#define    VMADD3       VNMSUB
+#define    VMADD4       VNMSUB
+
+#define    MADD1       MADD
+#define    MADD2       NMSUB
+#define    MADD3       NMSUB
+#define    MADD4       NMSUB
+#endif
+
+/*
+ * Single-precision complex GEMM / TRMM micro-kernel, 2x2 tile, LSX version.
+ *
+ * Inputs arrive in M, N, K, ALPHA_R, ALPHA_I, A, B, C, LDC (plus OFFSET when
+ * TRMMKERNEL is defined).  For every tile of C the k loop accumulates the
+ * products of the packed A and B panels into acc, then
+ *     TRMMKERNEL undefined:  C += alpha * acc
+ *     TRMMKERNEL defined:    C  = alpha * acc      (C is never read)
+ *
+ * Loop nest:
+ *     .L10  column pairs (N >> 1)        .L20  last column (N & 1)
+ *       .L11  2x2 tiles, LSX vectors       .L21  2x1 tiles, scalar
+ *       .L15  1x2 tile,  scalar            .L25  1x1 tile,  scalar
+ *
+ * Stack frame (128 bytes):
+ *       0.. 39  $r23-$r27
+ *      40..111  $f23-$f31 (8 bytes each)
+ *     112, 120  alpha real / imaginary (source of the vldrepl.w broadcasts)
+ */
+    PROLOGUE
+
+    addi.d     $sp,    $sp,   -128
+    SDARG      $r23,   $sp,   0
+    SDARG      $r24,   $sp,   8
+    SDARG      $r25,   $sp,   16
+    SDARG      $r26,   $sp,   24
+    SDARG      $r27,   $sp,   32
+    /* ST/LD move 4-byte values in this file (see the 0x04-strided scalar
+       accesses below).  The FP registers saved here are 64 bits wide and
+       $vr28-$vr31 overwrite $f28-$f31, so the full 64 bits are spilled with
+       fst.d instead of ST to keep the callee-saved contents intact. */
+    fst.d      $f23,   $sp,   40
+    fst.d      $f24,   $sp,   48
+    fst.d      $f25,   $sp,   56
+    fst.d      $f26,   $sp,   64
+    fst.d      $f27,   $sp,   72
+    fst.d      $f28,   $sp,   80
+    fst.d      $f29,   $sp,   88
+    fst.d      $f30,   $sp,   96
+    fst.d      $f31,   $sp,   104
+    ST         ALPHA_R,$sp,   112
+    ST         ALPHA_I,$sp,   120
+
+    /* Broadcast alpha into all four 32-bit lanes */
+    vldrepl.w  VALPHAR, $sp, 112
+    vldrepl.w  VALPHAI, $sp, 120
+
+#if defined (TRMMKERNEL) && !defined(LEFT)
+    sub.d      OFF,    $r0,   OFFSET
+#else
+    xor        OFF,    OFF,   OFF
+#endif
+
+    /* LDC *= 4; one column of C is 2 * LDC bytes from here on */
+    slli.d     LDC,    LDC,   2
+
+    move       J,      $r0
+    srai.d     T0,     N,     1
+    beq        J,      T0,    .L19
+
+.L10:  /* for(j=0; j<bn/2; j+=1) */
+    move       C0,     C
+    slli.d     TL,     LDC,   1
+    add.d      C1,     C0,    TL
+    move       A0,     A    //ptrba
+
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L150
+
+.L11:  /* for(i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B     //ptrbb
+#else
+    /* skip OFF k-steps: 16 bytes per step in both panels */
+    slli.d     C3,     OFF,   0x04
+    add.d      A0,     A0,    C3
+    add.d      B0,     B,     C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF   //temp
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    /* U0 / U1 accumulate the even / odd words of the 2x2 tile */
+    vxor.v    U0,     U0,   U0
+    vxor.v    U1,     U1,   U1
+
+    move       L,      $r0   //cycle param k
+    srai.d     C2,     TL,     2
+    beq        L,      C2,    .L130
+    blt        C2,     L,     .L130
+
+.L12:  /* for(k=0; k<bk/4; k+=1), four k-steps per iteration */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x10  //a0 a1 a2 a3
+    vld       D1,     B0,    0x10  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x20  //a0 a1 a2 a3
+    vld       D1,     B0,    0x20  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    vld       D0,     A0,    0x30  //a0 a1 a2 a3
+    vld       D1,     B0,    0x30  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x40
+    addi.d     B0,     B0,    0x40
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L12
+
+.L130:
+    move       L,      $r0
+    andi       C2,     TL,     3
+    beq        L,      C2,    .L14
+
+.L13:  /* for(k=0; k<(bk&3); k+=1) */
+    vld       D0,     A0,    0x00  //a0 a1 a2 a3
+    vld       D1,     B0,    0x00  //b0 b1 b2 b3
+
+    vshuf4i.w D4,     D1,    0xa0    //b0 b0 b2 b2
+    vshuf4i.w D5,     D1,    0xf5    //b1 b1 b3 b3
+
+    vshuf4i.w  D2,     D0,    0x88  //a0 a2 a0 a2
+    vshuf4i.w  D3,     D0,    0xdd  //a1 a3 a1 a3
+
+    VMADD1      U0,     D2,    D4,     U0  //res0 2 4 6
+    VMADD2      U1,     D3,    D4,     U1  //res1 3 5 7
+    VMADD3      U0,     D3,    D5,     U0
+    VMADD4      U1,     D2,    D5,     U1
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      C2,    .L13
+
+.L14:  /* scale the 2x2 tile by alpha and write it back */
+#if defined(TRMMKERNEL)
+    /* The result replaces C, so the previous tile contents are not loaded */
+    vfmul.s      U10,    U0,    VALPHAR
+    vfmul.s      U11,    U1,    VALPHAR
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#else
+    vld       U8,     C0,    0x00  //0 1 2 3
+    vld       U9,     C1,    0x00  //4 5 6 7
+
+    /* regroup C into even words (U10) and odd words (U11) to match U0 / U1 */
+    vpackev.w U10,    U9,    U8    //0 4 2 6
+    vpermi.w  U10,    U10,   0xd8  //0 2 4 6
+
+    vpackod.w U11,    U9,    U8    //1 5 3 7
+    vpermi.w  U11,    U11,   0xd8  //1 3 5 7
+
+    VFMADD      U10,    U0,    VALPHAR, U10
+    VFMADD      U11,    U1,    VALPHAR, U11
+    VNMSUB     U10,    U1,    VALPHAI, U10
+    VFMADD      U11,    U0,    VALPHAI, U11
+
+    vilvl.w   U8,     U11,   U10  //0 1 2 3
+
+    vilvh.w   U9,     U11,   U10  //4 5 6 7
+
+    vst       U8,     C0,    0x00
+    vst       U9,     C1,    0x00
+#endif
+
+#if defined(TRMMKERNEL)
+
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    /* step over the k range that was not visited for this tile */
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+    addi.d     C1,     C1,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L11
+
+.L150:
+    move       I,      $r0
+    andi       T0,     M,     1
+    beq        I,      T0,    .L18
+
+.L15:  /* for(i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    /* 8 bytes per k-step in A (one row), 16 in B (two columns) */
+    slli.d     C3,     OFF,  0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x04
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   2
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L17
+    blt        TL,     L,     .L17
+
+.L16:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         b3,     B0,    0x08        //load4
+    MADD1      c21,    a1,    b3,     c21  //res2
+    MADD2      c22,    a2,    b3,     c22  //res3
+    LD         b4,     B0,    0x0c        //load5
+    MADD3      c21,    a2,    b4,     c21
+    MADD4      c22,    a1,    b4,     c22
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x10
+
+    addi.d     L,      L,     1
+    blt        L,      TL,     .L16
+
+.L17:  /* scale the 1x2 tile by alpha and write it back */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        b5,     c21,   ALPHA_R
+    MUL        b6,     c22,   ALPHA_I
+    SUB        b5,     b5,    b6
+    ST         b5,     C1,    0x00
+
+    MUL        b5,     c22,   ALPHA_R
+    MUL        b6,     c21,   ALPHA_I
+    ADD        b6,     b5,    b6
+    ST         b6,     C1,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         b5,     C1,    0x00    //C1[0]
+    LD         b6,     C1,    0x04    //C1[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+
+    MADD       b5,     c21,   ALPHA_R, b5
+    MADD       b6,     c22,   ALPHA_R, b6
+    NMSUB      b5,     c22,   ALPHA_I, b5
+    MADD       b6,     c21,   ALPHA_I, b6
+    ST         b5,     C1,    0x00
+    ST         b6,     C1,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -2
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x04
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+    addi.d     C1,     C1,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L15
+
+.L18:  /* advance to the next pair of columns */
+#if defined(TRMMKERNEL) && !defined(LEFT)
+    addi.d     OFF,    OFF,   2
+#endif
+
+    slli.d     L,      K,     0x04
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   0x02
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    srai.d     T0,     N,     1
+    blt        J,      T0,    .L10
+
+.L19:
+    move       J,      $r0
+    andi       T0,     N,     1
+    beq        J,      T0,    .L30
+
+.L20: /* for (j=0; j<(bn&1); j+=1) */
+#if defined(TRMMKERNEL) && defined(LEFT)
+    move       OFF,    OFFSET
+#endif
+
+    move       C0,     C
+    move       A0,     A    //ptrba
+
+    move       I,      $r0
+    srai.d     T0,     M,     1
+    beq        I,      T0,    .L24
+
+.L21:  /* for (i=0; i<bm/2; i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    /* 16 bytes per k-step in A (two rows), 8 in B (one column) */
+    slli.d     C3,     OFF,  0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     OFF,  0x03
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   2
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+    MTC        c21,    $r0
+    MTC        c22,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L23
+    blt        TL,     L,     .L23
+
+.L22:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+    LD         a3,     A0,    0x08        //load4
+    MADD1      c21,    a3,    b1,     c21  //res2
+    LD         a4,     A0,    0x0c        //load5
+    MADD2      c22,    a4,    b1,     c22  //res3
+    MADD3      c21,    a4,    b2,     c21
+    MADD4      c22,    a3,    b2,     c22
+
+    addi.d     A0,     A0,    0x10
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L22
+
+.L23:  /* scale the 2x1 tile by alpha and write it back */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+
+    MUL        a7,     c21,   ALPHA_R
+    MUL        a8,     c22,   ALPHA_I
+    SUB        a7,     a7,    a8
+    ST         a7,     C0,    0x08
+
+    MUL        a7,     c22,   ALPHA_R
+    MUL        a8,     c21,   ALPHA_I
+    ADD        a8,     a7,    a8
+    ST         a8,     C0,    0x0c
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+    LD         a7,     C0,    0x08    //C0[2]
+    LD         a8,     C0,    0x0c    //C0[3]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+    MADD       a7,     c21,   ALPHA_R, a7
+    MADD       a8,     c22,   ALPHA_R, a8
+    NMSUB      a7,     c22,   ALPHA_I, a7
+    MADD       a8,     c21,   ALPHA_I, a8
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+    ST         a7,     C0,    0x08
+    ST         a8,     C0,    0x0c
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -2
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x04
+    add.d      A0,     A0,   C3
+    slli.d     C3,     TL,   0x03
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   2
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x10
+
+    addi.d     I,      I,     1
+    blt        I,      T0,    .L21
+
+.L24:
+    move       I,      $r0
+    andi       T1,     M,     1    //bm&1
+    beq        I,      T1,    .L28
+
+.L25:  /* for (i=0; i<(bm&1); i+=1) */
+    move       B0,     B      //ptrbb
+    move       TL,     K      /* TL = bk */
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) &&  defined(TRANSA)) ||(!defined(LEFT) && !defined(TRANSA))
+    move       B0,     B    //ptrbb
+#else
+    /* 8 bytes per k-step in both panels */
+    slli.d     C3,     OFF,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B,    C3
+#endif
+
+#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
+    sub.d      TL,     K,     OFF
+#elif defined(LEFT)
+    addi.d     TL,     OFF,   1
+#else
+    addi.d     TL,     OFF,   1
+#endif
+
+#endif  // #if defined(TRMMKERNEL)
+
+    MTC        c11,    $r0
+    MTC        c12,    $r0
+
+    move       L,      $r0   //cycle param k
+    beq        L,      TL,    .L27
+    blt        TL,     L,     .L27
+
+.L26:  /* for (k=0; k<bk; k+=1) */
+    LD         a1,     A0,    0x00        //load0
+    LD         b1,     B0,    0x00        //load1
+    MADD1      c11,    a1,    b1,     c11  //res0
+    LD         a2,     A0,    0x04        //load2
+    MADD2      c12,    a2,    b1,     c12  //res1
+    LD         b2,     B0,    0x04        //load3
+    MADD3      c11,    a2,    b2,     c11
+    MADD4      c12,    a1,    b2,     c12
+
+    addi.d     A0,     A0,    0x08
+    addi.d     B0,     B0,    0x08
+
+    addi.d     L,      L,     1
+    blt        L,      TL,    .L26
+
+.L27:  /* scale the 1x1 tile by alpha and write it back */
+#if defined(TRMMKERNEL)
+    MUL        a5,     c11,   ALPHA_R
+    MUL        a6,     c12,   ALPHA_I
+    SUB        a5,     a5,    a6
+    ST         a5,     C0,    0x00
+
+    MUL        a5,     c12,   ALPHA_R
+    MUL        a6,     c11,   ALPHA_I
+    ADD        a6,     a5,    a6
+    ST         a6,     C0,    0x04
+#else
+    LD         a5,     C0,    0x00    //C0[0]
+    LD         a6,     C0,    0x04    //C0[1]
+
+    MADD       a5,     c11,   ALPHA_R, a5
+    MADD       a6,     c12,   ALPHA_R, a6
+    NMSUB      a5,     c12,   ALPHA_I, a5
+    MADD       a6,     c11,   ALPHA_I, a6
+
+    ST         a5,     C0,    0x00
+    ST         a6,     C0,    0x04
+#endif
+
+#if defined(TRMMKERNEL)
+#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
+    sub.d      TL,     K,    OFF
+#ifdef LEFT
+    addi.d     TL,     TL,   -1
+#else
+    addi.d     TL,     TL,   -1
+#endif
+    slli.d     C3,     TL,   0x03
+    add.d      A0,     A0,   C3
+    add.d      B0,     B0,   C3
+#endif
+
+#ifdef LEFT
+    addi.d     OFF,    OFF,   1
+#endif
+#endif   // #if defined(TRMMKERNEL)
+
+    addi.d     C0,     C0,    0x08
+
+    addi.d     I,      I,     1
+    blt        I,      T1,    .L25
+
+.L28:  /* advance past the single trailing column */
+    slli.d     L,      K,     3
+    add.d      B,      B,     L
+
+    slli.d     I,      LDC,   1
+    add.d      C,      C,     I
+
+    addi.d     J,      J,     1
+    andi       T0,     N,     1
+    blt        J,      T0,    .L20
+
+.L30:  /* restore the spilled registers and return */
+    LDARG      $r23,   $sp,   0
+    LDARG      $r24,   $sp,   8
+    LDARG      $r25,   $sp,   16
+    LDARG      $r26,   $sp,   24
+    LDARG      $r27,   $sp,   32
+    /* 64-bit reloads matching the fst.d spills in the prologue */
+    fld.d      $f23,   $sp,   40
+    fld.d      $f24,   $sp,   48
+    fld.d      $f25,   $sp,   56
+    fld.d      $f26,   $sp,   64
+    fld.d      $f27,   $sp,   72
+    fld.d      $f28,   $sp,   80
+    fld.d      $f29,   $sp,   88
+    fld.d      $f30,   $sp,   96
+    fld.d      $f31,   $sp,   104
+
+    addi.d     $sp,    $sp,   128
+    jirl       $r0,    $r1,   0x0
+
+    EPILOGUE
--- a/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
+++ b/kernel/loongarch64/cgemm_kernel_8x4_lsx.S
--- a/kernel/loongarch64/cgemm_ncopy_2_lasx.S
+++ b/kernel/loongarch64/cgemm_ncopy_2_lasx.S
@ -0,0 +1,193 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters */
+#define M      $r4    // param 1: m
+#define N      $r5    // param 2: n
+#define SRC    $r6    // param 3: src
+#define LDA    $r7    // param 4: lda
+#define DST    $r8    // param 5: dst
+
+/* Loop counters */
+#define I      $r9
+#define J      $r10
+/* Source column pointers; only S1 and S2 are referenced in this file */
+#define S1     $r12
+#define S2     $r13
+#define S3     $r14
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+/* TD: running destination pointer, TS: running source pointer */
+#define TD     $r20
+#define TS     $r11
+/* TL shares $r7 with LDA: the scaled stride overwrites the incoming lda */
+#define TL     $r7
+/* T0 is $r23, which the routine spills to the stack and restores */
+#define T0     $r23
+#define ZERO   $r0
+
+/* Scalar floating-point registers for the element-wise tail loops */
+#define F0     $f0
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LASX vectors */
+#define U0     $xr0
+#define U1     $xr1
+#define U2     $xr2
+#define U3     $xr3
+#define U4     $xr4
+#define U5     $xr5
+#define U6     $xr6
+#define U7     $xr7
+#define D0     $xr8
+#define D1     $xr9
+#define D2     $xr10
+#define D3     $xr11
+#define D4     $xr12
+#define D5     $xr13
+#define D6     $xr14
+#define D7     $xr15
+#define D8     $xr16
+
+/*
+ * Packing routine: copies an m x n panel from SRC (column stride LDA
+ * elements of 8 bytes) to DST, two columns at a time.
+ *
+ * For each pair of columns the 8-byte elements are interleaved row by row
+ *     dst = col0[0], col1[0], col0[1], col1[1], ...
+ * using LASX for groups of four rows (m >> 2) and scalar moves for the
+ * remaining m & 3 rows.  A trailing odd column (n & 1) is copied unchanged.
+ */
+    PROLOGUE
+
+    /* 16-byte frame keeps $sp on the 16-byte boundary the psABI requires;
+       only the first 8 bytes are used, for $r23 (T0). */
+    addi.d     $sp,  $sp,  -16
+    SDARG      $r23, $sp,  0
+
+    move       TD,   DST   //boffset
+    move       TS,   SRC   //aoffset
+
+    /* TL = lda * 8: byte stride of one column; T0 = stride of two columns */
+    slli.d     TL,   LDA,  0x02  //lda
+    slli.d     TL,   TL,   0x01
+    slli.d     T0,   TL,   0x01
+
+    srai.d     I,    N,    0x01
+    beq        I,    ZERO,  .L_N0
+
+.L_J1: /* if (i > 0) I-- */
+    move       S1,   TS         //a_offset1
+    add.d      S2,   TS,   TL   //a_offset2
+    srai.d     J,    M,    0x02
+    add.d      TS,   TS,   T0
+
+    beq        J,    ZERO,  .L_I3
+
+.L_I1: /* if (j > 0) J-- */
+    /* U0 and U1 both hold four rows of column 0: the first xvpermi.q
+       overwrites U0, so U1 keeps the untouched copy for the second one. */
+    xvld       U0,   S1,   0x00
+    xvld       U1,   S1,   0x00
+    xvld       U2,   S2,   0x00
+
+    xvpermi.q  U0,   U2,   0x02   // rows 0-1 of column 0 | rows 0-1 of column 1
+    xvpermi.q  U2,   U1,   0x31   // rows 2-3 of column 0 | rows 2-3 of column 1
+
+    /* swap the two middle 64-bit lanes so the columns alternate per row */
+    xvpermi.d  U0,   U0,   0xd8
+    xvpermi.d  U2,   U2,   0xd8
+
+    xvst       U0,   TD,   0x00
+    xvst       U2,   TD,   0x20
+
+    addi.d     S1,   S1,   0x20   // a_offset1
+    addi.d     S2,   S2,   0x20
+    addi.d     TD,   TD,   0x40  // b_offset
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1
+
+.L_I3:
+    andi       J,    M,    0x03
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,   S1,   0x00
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1
+
+.L_II20:
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1
+
+.L_N0:  /* if(n&1)*/
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00
+
+.L_N1:  /* last column: straight copy from TS to TD */
+    srai.d     J,     M,   0x02
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* j = (m >> 2) if (j > 0) */
+    xvld       U0,    TS,   0x00
+
+    xvst       U0,    TD,   0x00
+
+    addi.d     TS,    TS,   0x20   // a_offset
+    addi.d     TD,    TD,   0x20   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11
+
+.L_N10:
+    andi       J,     M,    0x03
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* j = (m & 3)  if (j > 0) */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset
+    addi.d     TD,    TD,   0x08   // b_offset
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12
+
+.L_N00:  /* restore $r23, release the frame and return */
+    LDARG     $r23,  $sp, 0
+    addi.d    $sp,   $sp, 16
+    jirl      $r0,   $r1, 0x00
+
+    EPILOGUE
--- a/kernel/loongarch64/cgemm_ncopy_2_lsx.S
+++ b/kernel/loongarch64/cgemm_ncopy_2_lsx.S
@ -0,0 +1,202 @@
+/*******************************************************************************
+Copyright (c) 2021, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*******************************************************************************/
+#define ASSEMBLER
+
+#include "common.h"
+
+/* Function parameters: integer argument registers in call order */
+#define M      $r4    // param 1: m, element count copied from each source vector
+#define N      $r5    // param 2: n, number of lda-strided source vectors
+#define SRC    $r6    // param 3: src, source base address
+#define LDA    $r7    // param 4: lda, in elements; scaled to bytes in place via TL
+#define DST    $r8    // param 5: dst, packed destination buffer
+
+#define I      $r9    // outer counter: n >> 1 pairs, later reused for n & 1
+#define J      $r10   // inner counter: m >> 2 groups, then m & 3 leftovers
+#define S1     $r12   // read pointer into the first vector of the current pair
+#define S2     $r13   // read pointer into the second vector of the current pair
+#define S3     $r14   // S3..S8 are not referenced anywhere in this file
+#define S4     $r15
+#define S5     $r16
+#define S6     $r17
+#define S7     $r18
+#define S8     $r19
+#define TD     $r20   // destination write pointer
+#define TS     $r11   // source pointer: start of the next pair / of the odd last vector
+#define TL     $r7    // same register as LDA: holds lda scaled to a byte stride
+#define T0     $r23   // 2 * TL; $r23 is spilled to the stack on entry and reloaded on exit
+#define ZERO   $r0    // compared against as the constant 0
+
+#define F0     $f0    // F0..F3 carry scalar floats in the leftover loops
+#define F1     $f1
+#define F2     $f2
+#define F3     $f3
+#define F4     $f4    // F4..F7 are not referenced anywhere in this file
+#define F5     $f5
+#define F6     $f6
+#define F7     $f7
+
+/* LSX vectors ($vrN, accessed 16 bytes at a time by vld/vst below) */
+#define U0     $vr0   // U0..U3: raw data loaded from S1/S2 (or TS)
+#define U1     $vr1
+#define U2     $vr2
+#define U3     $vr3
+#define U4     $vr4   // U4..U7 are not referenced anywhere in this file
+#define U5     $vr5
+#define U6     $vr6
+#define U7     $vr7
+#define D0     $vr8   // D0..D3: interleaved results written to TD
+#define D1     $vr9
+#define D2     $vr10
+#define D3     $vr11
+#define D4     $vr12  // D4..D8 are not referenced anywhere in this file
+#define D5     $vr13
+#define D6     $vr14
+#define D7     $vr15
+#define D8     $vr16
+
+    PROLOGUE   /* Packs src into dst: source vectors (lda elements apart) are taken two at a time and their 8-byte elements interleaved; an odd last vector is copied straight through. */
+
+    addi.d     $sp,  $sp,  -8   // make room for one 8-byte spill; NOTE(review): the LoongArch psABI asks for 16-byte sp alignment - presumably harmless in a routine that makes no calls, confirm
+    SDARG      $r23, $sp,  0    // spill $r23, which is used as T0 below
+
+    move       TD,   DST   // boffset: destination write pointer
+    move       TS,   SRC   // aoffset: source read pointer
+
+    slli.d     TL,   LDA,  0x02  // lda * 4 (overwrites LDA: TL and LDA are both $r7)
+    slli.d     TL,   TL,   0x01  // TL = lda * 8: byte stride between source vectors (8 bytes per element)
+    slli.d     T0,   TL,   0x01  // T0 = 2 * TL: byte stride from one pair of vectors to the next
+
+    srai.d     I,    N,    0x01  // I = n >> 1: number of vector pairs
+    beq        I,    ZERO,  .L_N0  // no full pair: go straight to the odd-vector check
+
+.L_J1: /* outer loop, runs I = n >> 1 times: one pair of source vectors per pass */
+    move       S1,   TS         // a_offset1: first vector of the pair
+    add.d      S2,   TS,   TL   // a_offset2: second vector, one stride further
+    srai.d     J,    M,    0x02  // J = m >> 2: groups of four elements
+    add.d      TS,   TS,   T0    // advance TS to the next pair for the following pass
+
+    beq        J,    ZERO,  .L_I3  // fewer than four elements: only the leftover loop runs
+
+.L_I1: /* inner loop, runs J = m >> 2 times: four elements from each vector per pass */
+    vld       U0,   S1,   0x00   // first vector, bytes 0..15  (elements 0-1)
+    vld       U1,   S1,   0x10   // first vector, bytes 16..31 (elements 2-3)
+    vld       U2,   S2,   0x00   // second vector, bytes 0..15
+    vld       U3,   S2,   0x10   // second vector, bytes 16..31
+
+    vand.v    D0,   U2,   U2   // x AND x = x: seed D0..D3 with copies of the second vector,
+    vand.v    D1,   U3,   U3   // because vpermi.w below reads its destination as an input
+    vand.v    D2,   U2,   U2
+    vand.v    D3,   U3,   U3
+
+    vpermi.w  D0,   U0,   0x44  // D0 = { U0 words 0-1, old D0 words 0-1 } -> element 0 of vector 1, element 0 of vector 2
+    vpermi.w  D2,   U0,   0xee  // D2 = { U0 words 2-3, old D2 words 2-3 } -> element 1 of each vector
+    vpermi.w  D1,   U1,   0x44  // D1: element 2 of each vector
+    vpermi.w  D3,   U1,   0xee  // D3: element 3 of each vector
+
+    vst       D0,   TD,   0x00  // stores are ordered D0, D2, D1, D3 so elements land in order 0, 1, 2, 3
+    vst       D2,   TD,   0x10
+    vst       D1,   TD,   0x20
+    vst       D3,   TD,   0x30
+
+    addi.d     S1,   S1,   0x20   // a_offset1 += 32 bytes (four elements consumed)
+    addi.d     S2,   S2,   0x20   // a_offset2 += 32 bytes
+    addi.d     TD,   TD,   0x40  // b_offset += 64 bytes (eight elements written)
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_I1  // repeat while J > 0
+
+.L_I3:
+    andi       J,    M,    0x03  // J = m & 3: elements left over after the groups of four
+    beq        J,    ZERO, .L_II20
+
+.L_II1:  /* leftover loop, runs J = m & 3 times: one element from each vector per pass */
+    fld.s      F0,   S1,   0x00  // first vector: the element's two 4-byte floats
+    fld.s      F1,   S1,   0x04
+    fld.s      F2,   S2,   0x00  // second vector: the element's two 4-byte floats
+    fld.s      F3,   S2,   0x04
+
+    fst.s      F0,   TD,   0x00  // written as: vector-1 element, then vector-2 element
+    fst.s      F1,   TD,   0x04
+    fst.s      F2,   TD,   0x08
+    fst.s      F3,   TD,   0x0c
+
+    addi.d     S1,   S1,   0x08  // one 8-byte element consumed from each vector
+    addi.d     S2,   S2,   0x08
+    addi.d     TD,   TD,   0x10  // two elements (16 bytes) written
+
+    addi.d     J,    J,    -1
+    blt        ZERO, J,    .L_II1  // repeat while J > 0
+
+.L_II20:
+    addi.d     I,    I,    -1
+    blt        ZERO, I,    .L_J1  // next pair while I > 0
+
+.L_N0:  /* if (n & 1): one unpaired vector remains, starting at TS */
+    andi       I,     N,   0x01
+    beq        ZERO,  I,   .L_N00  // n is even: nothing left, go to the exit sequence
+
+.L_N1:
+    srai.d     J,     M,   0x02  // J = m >> 2: groups of four elements
+    beq        ZERO,  J,   .L_N10
+
+.L_N11: /* runs J = m >> 2 times: copy four elements (32 bytes) unchanged */
+    vld       U0,    TS,   0x00
+    vld       U1,    TS,   0x10
+
+    vst       U0,    TD,   0x00
+    vst       U1,    TD,   0x10
+
+    addi.d     TS,    TS,   0x20   // a_offset += 32 bytes
+    addi.d     TD,    TD,   0x20   // b_offset += 32 bytes
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N11  // repeat while J > 0
+
+.L_N10:
+    andi       J,     M,    0x03  // J = m & 3: leftover elements of the unpaired vector
+    beq        J,     ZERO, .L_N00
+
+.L_N12:  /* runs J = m & 3 times: copy one element (two floats) unchanged */
+    fld.s      F0,    TS,   0x00
+    fld.s      F1,    TS,   0x04
+
+    fst.s      F0,    TD,   0x00
+    fst.s      F1,    TD,   0x04
+
+    addi.d     TS,    TS,   0x08   // a_offset += 8 bytes
+    addi.d     TD,    TD,   0x08   // b_offset += 8 bytes
+
+    addi.d     J,     J,   -1
+    blt        ZERO,  J,   .L_N12  // repeat while J > 0
+
+.L_N00:
+    LDARG     $r23,  $sp, 0     // reload the $r23 value spilled on entry
+    addi.d    $sp,   $sp, 8     // release the 8-byte spill slot
+    jirl      $r0,   $r1, 0x00  // jump to the address in $r1; the link value goes to $r0
+
+    EPILOGUE
--- a/Show More
+++ b/Show More