diff --git a/benchmark/trsv.c b/benchmark/trsv.c index 66ac3a3c7..e17c57157 100644 --- a/benchmark/trsv.c +++ b/benchmark/trsv.c @@ -127,7 +127,7 @@ int main(int argc, char *argv[]){ long long muls = n*(n+1)/2.0; long long adds = (n - 1.0)*n/2.0; - fprintf(stderr, "%10d %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); + fprintf(stderr, "%10d : %10.2f MFlops %10.6f sec\n", n,(muls+adds) / timeg * 1.e-6, timeg); if(a != NULL){ free(a); } diff --git a/c_check b/c_check index b018c10a8..b5e4a9ad0 100755 --- a/c_check +++ b/c_check @@ -199,8 +199,7 @@ if [ "$architecture" = "loongarch64" ]; then tmpd="$(mktemp -d)" tmplsx="$tmpd/lsx.c" codelsx='"vadd.b $vr0, $vr0, $vr0"' - lsx_flags='-march=loongarch64 -mlsx' - printf "#include \n\n" >> "$tmplsx" + lsx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelsx" >> "$tmplsx" args="$lsx_flags -o $tmplsx.o $tmplsx" { @@ -211,8 +210,7 @@ if [ "$architecture" = "loongarch64" ]; then tmplasx="$tmpd/lasx.c" codelasx='"xvadd.b $xr0, $xr0, $xr0"' - lasx_flags='-march=loongarch64 -mlasx' - printf "#include \n\n" >> "$tmplasx" + lasx_flags='-march=loongarch64' printf "void main(void){ __asm__ volatile(%s);}\n" "$codelasx" >> "$tmplasx" args="$lasx_flags -o $tmplasx.o $tmplasx" { diff --git a/c_check.pl b/c_check.pl index 7a860a211..d9c36793c 100644 --- a/c_check.pl +++ b/c_check.pl @@ -241,8 +241,7 @@ if (($architecture eq "loongarch64")) { } else { $tmplsx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $codelsx = '"vadd.b $vr0, $vr0, $vr0"'; - $lsx_flags = "-march=loongarch64 -mlsx"; - print $tmplsx "#include \n\n"; + $lsx_flags = "-march=loongarch64"; print $tmplsx "void main(void){ __asm__ volatile($codelsx); }\n"; $args = "$lsx_flags -o $tmplsx.o $tmplsx"; @@ -257,8 +256,7 @@ if (($architecture eq "loongarch64")) { $tmplasx = new File::Temp( SUFFIX => '.c' , UNLINK => 1 ); $codelasx = '"xvadd.b $xr0, $xr0, $xr0"'; - $lasx_flags = "-march=loongarch64 -mlasx"; - print $tmplasx "#include \n\n"; + $lasx_flags = "-march=loongarch64"; print $tmplasx "void main(void){ __asm__ volatile($codelasx); }\n"; $args = "$lasx_flags -o $tmplasx.o $tmplasx"; diff --git a/common_loongarch64.h b/common_loongarch64.h index ce1fcf091..4963b2f07 100644 --- a/common_loongarch64.h +++ b/common_loongarch64.h @@ -124,7 +124,17 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.d #define CMPLT fcmp.clt.d #define NEG fneg.d + +#define XVFSUB xvfsub.d +#define XVFADD xvfadd.d +#define XVFMADD xvfmadd.d + +#define VFSUB vfsub.d +#define VFADD vfadd.d +#define VFMADD vfmadd.d + #else + #define LD fld.s #define ST fst.s #define MADD fmadd.s @@ -142,6 +152,15 @@ static inline int WhereAmI(void){ #define CMPLE fcmp.cle.s #define CMPLT fcmp.clt.s #define NEG fneg.s + +#define XVFSUB xvfsub.s +#define XVFADD xvfadd.s +#define XVFMADD xvfmadd.s + +#define VFSUB vfsub.s +#define VFADD vfadd.s +#define VFMADD vfmadd.s + #endif /* defined(DOUBLE) */ #if defined(__64BIT__) && defined(USE64BITINT) diff --git a/cpuid_loongarch64.c b/cpuid_loongarch64.c index 7c389db27..0ad32ae4e 100644 --- a/cpuid_loongarch64.c +++ b/cpuid_loongarch64.c @@ -47,8 +47,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#define CPU_LOONGSON3R5 1 #define CPU_LOONGSON2K1000 2 -#define LA_HWCAP_LSX (1<<4) -#define LA_HWCAP_LASX (1<<5) +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static char *cpuname[] = { "LOONGSONGENERIC", @@ -64,11 +64,11 @@ static char *cpuname_lower[] = { int detect(void) { #ifdef __linux - int flag = (int)getauxval(AT_HWCAP); + int hwcap = (int)getauxval(AT_HWCAP); - if (flag & LA_HWCAP_LASX) + if (hwcap & LA_HWCAP_LASX) return CPU_LOONGSON3R5; - else if (flag & LA_HWCAP_LSX) + else if (hwcap & LA_HWCAP_LSX) return CPU_LOONGSON2K1000; else return CPU_GENERIC; @@ -94,7 +94,9 @@ void get_subdirname(void) { } void get_cpuconfig(void) { + uint32_t hwcaps = 0; int d = detect(); + switch (d) { case CPU_LOONGSON3R5: printf("#define LOONGSON3R5\n"); @@ -129,6 +131,10 @@ void get_cpuconfig(void) { printf("#define L2_ASSOCIATIVE 16\n"); break; } + + hwcaps = (uint32_t)getauxval( AT_HWCAP ); + if (hwcaps & LA_HWCAP_LSX) printf("#define HAVE_LSX\n"); + if (hwcaps & LA_HWCAP_LASX) printf("#define HAVE_LASX\n"); } void get_libname(void){ diff --git a/driver/others/dynamic_loongarch64.c b/driver/others/dynamic_loongarch64.c index 52f8bcb2f..44de59669 100644 --- a/driver/others/dynamic_loongarch64.c +++ b/driver/others/dynamic_loongarch64.c @@ -25,6 +25,7 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. *******************************************************************************/ +#include #include "common.h" extern gotoblas_t gotoblas_LOONGSON3R5; @@ -74,21 +75,15 @@ static gotoblas_t *force_coretype(char *coretype) { return NULL; } -#define LASX_MASK 1<<7 -#define LSX_MASK 1<<6 -#define LOONGARCH_CFG2 0x02 +#define LA_HWCAP_LSX (1U << 4) +#define LA_HWCAP_LASX (1U << 5) static gotoblas_t *get_coretype(void) { - int ret = 0; - __asm__ volatile ( - "cpucfg %0, %1 \n\t" - : "+&r"(ret) - : "r"(LOONGARCH_CFG2) - ); + int hwcap = (int)getauxval(AT_HWCAP); - if (ret & LASX_MASK) + if (hwcap & LA_HWCAP_LASX) return &gotoblas_LOONGSON3R5; - else if (ret & LSX_MASK) + else if (hwcap & LA_HWCAP_LSX) return &gotoblas_LOONGSON2K1000; else return &gotoblas_LOONGSONGENERIC; diff --git a/kernel/loongarch64/KERNEL.LOONGSON2K1000 b/kernel/loongarch64/KERNEL.LOONGSON2K1000 new file mode 100644 index 000000000..b2a396674 --- /dev/null +++ b/kernel/loongarch64/KERNEL.LOONGSON2K1000 @@ -0,0 +1,7 @@ +ifndef NO_LSX + +SDOTKERNEL = dot_lsx.S +DSDOTKERNEL = dot_lsx.S +DDOTKERNEL = dot_lsx.S + +endif diff --git a/kernel/loongarch64/KERNEL.LOONGSON3R5 b/kernel/loongarch64/KERNEL.LOONGSON3R5 index 011e8b89e..020a82303 100644 --- a/kernel/loongarch64/KERNEL.LOONGSON3R5 +++ b/kernel/loongarch64/KERNEL.LOONGSON3R5 @@ -1,4 +1,9 @@ ifndef NO_LASX + +SDOTKERNEL = dot_lasx.S +DSDOTKERNEL = dot_lasx.S +DDOTKERNEL = dot_lasx.S + DGEMMKERNEL = dgemm_kernel_16x4.S DGEMMINCOPY = dgemm_ncopy_16.S DGEMMITCOPY = dgemm_tcopy_16.S diff --git a/kernel/loongarch64/dot_lasx.S b/kernel/loongarch64/dot_lasx.S new file mode 100644 index 000000000..0715b6311 --- /dev/null +++ b/kernel/loongarch64/dot_lasx.S @@ -0,0 +1,368 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. 
Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. */ +#define s1 $f8 +#define s2 $f9 +#define a1 $f10 +#define b1 $f11 + +PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + /* init $f8 and $f9 to zero */ + SUB s1, s1, s1 + SUB s2, s2, s2 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + bne INCX, TEMP, .L20 /* inc_x=1 */ + bne INCY, TEMP, .L20 /* inc_y=1 */ + + /* !((inc_x == 1) && (inc_y == 1)) */ + + /* init $xr8 and $xr9 to zero */ +#ifdef DOUBLE + xvldrepl.d $xr0, X, 0 +#else + xvldrepl.w $xr0, X, 0 +#endif +#ifdef DSDOT + xvfcvtl.d.s $xr0, $xr0 + xvfsub.d $xr8, $xr0, $xr0 + xvfsub.d $xr9, $xr0, $xr0 +#else + XVFSUB $xr8, $xr0, $xr0 + XVFSUB $xr9, $xr0, $xr0 +#endif + +#ifdef DOUBLE + srai.d I, N, 4 +#else + srai.d I, N, 5 +#endif + bge $r0, I, .L12 /* FLOAT: <32 ; DOUBLE: <16 */ + .align 3 +.L11: + /* FLOAT: 32~ ; DOUBLE: 16~ */ + xvld $xr0, X, 0 + xvld $xr1, X, 32 + xvld $xr2, X, 64 + xvld $xr3, X, 96 + xvld $xr4, Y, 0 + xvld $xr5, Y, 32 + xvld $xr6, Y, 64 + xvld $xr7, Y, 96 + addi.w I, I, -1 + addi.d X, X, 128 + addi.d Y, Y, 128 +#ifdef DSDOT + xvfcvtl.d.s $xr10, $xr0 + xvfcvtl.d.s $xr11, $xr4 + xvfcvth.d.s $xr12, $xr0 + xvfcvth.d.s $xr13, $xr4 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr1 + xvfcvtl.d.s $xr11, $xr5 + xvfcvth.d.s $xr12, $xr1 + xvfcvth.d.s $xr13, $xr5 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr2 + xvfcvtl.d.s $xr11, $xr6 + xvfcvth.d.s $xr12, $xr2 + xvfcvth.d.s $xr13, $xr6 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 + xvfcvtl.d.s $xr10, $xr3 + xvfcvtl.d.s $xr11, $xr7 + xvfcvth.d.s $xr12, $xr3 + xvfcvth.d.s $xr13, $xr7 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 +#else + XVFMADD $xr8, $xr0, $xr4, $xr8 + XVFMADD $xr9, $xr1, $xr5, $xr9 + XVFMADD $xr8, $xr2, $xr6, $xr8 + XVFMADD 
$xr9, $xr3, $xr7, $xr9 +#endif + bnez I, .L11 + .align 3 +.L12: +#ifdef DOUBLE + andi I, N, 0xf + srai.d I, I, 2 +#else + andi I, N, 0x1f + srai.d I, I, 3 +#endif + bge $r0, I, .L14 /* DOUBLE: <4 ; FLOAT: <8 */ + .align 3 +.L13: + /* FLOAT: 8~31 ; DOUBLE: 4~15 */ + xvld $xr0, X, 0 + xvld $xr4, Y, 0 + addi.w I, I, -1 + addi.d X, X, 32 + addi.d Y, Y, 32 +#ifdef DSDOT + xvfcvtl.d.s $xr10, $xr0 + xvfcvtl.d.s $xr11, $xr4 + xvfcvth.d.s $xr12, $xr0 + xvfcvth.d.s $xr13, $xr4 + xvfmadd.d $xr8, $xr10, $xr12, $xr8 + xvfmadd.d $xr9, $xr11, $xr13, $xr9 +#else + XVFMADD $xr8, $xr0, $xr4, $xr8 +#endif + bnez I, .L13 + .align 3 +.L14: + /* store dot in s1 $f8 */ +#ifdef DSDOT + xvfadd.d $xr8, $xr8, $xr9 + fsub.s s2, s2, s2, /* set s2 to 0.0 */ + xvpermi.q $xr0, $xr8, 0x1 + vfadd.d $vr8, $vr8, $vr0 + vpackod.d $vr0, $vr8, $vr8 + vfadd.d $vr8, $vr8, $vr0 +#else + XVFADD $xr8, $xr8, $xr9 + SUB s2, s2, s2 /* set s2 to 0.0 */ + xvpermi.q $xr0, $xr8, 0x1 + VFADD $vr8, $vr8, $vr0 + vpackod.d $vr0, $vr8, $vr8 +#ifdef DOUBLE + VFADD $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr0 + vpackod.w $vr0, $vr8, $vr8 + VFADD $vr8, $vr8, $vr0 +#endif /* defined DOUBLE */ +#endif /* defined DSDOT */ + .align 3 +.L15: +#ifdef DOUBLE + andi I, N, 0x3 +#else + andi I, N, 0x7 +#endif + bge $r0, I, .L999 /* =0 */ + .align 3 +.L16: + /* FLOAT: 1~7 ; DOUBLE: 1~3 */ + LD a1, X, 0 + LD b1, Y, 0 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + bnez I, .L16 + b .L999 + .align 3 + +.L20: +/* !((inc_x == 1) && (inc_y == 1)) */ + srai.d I, N, 3 +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 /* <8 */ + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 
7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + +EPILOGUE diff --git a/kernel/loongarch64/dot_lsx.S b/kernel/loongarch64/dot_lsx.S new file mode 100644 index 000000000..f7f613553 --- /dev/null +++ b/kernel/loongarch64/dot_lsx.S @@ -0,0 +1,364 @@ +/*************************************************************************** +Copyright (c) 2023, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*****************************************************************************/ + +#define ASSEMBLER + +#include "common.h" + +#define N $r4 +#define X $r5 +#define INCX $r6 +#define Y $r7 +#define INCY $r8 + +#define I $r17 +#define TEMP $r18 + +/* Don't change following FR unless you know the effects. 
*/ +#define s1 $f8 +#define s2 $f9 +#define a1 $f10 +#define b1 $f11 + +PROLOGUE + +#ifdef F_INTERFACE + LDINT N, 0(N) + LDINT INCX, 0(INCX) + LDINT INCY, 0(INCY) +#endif + + /* init $f8 and $f9 to zero */ + SUB s1, s1, s1 + SUB s2, s2, s2 + slli.d INCX, INCX, BASE_SHIFT + li.d TEMP, SIZE + slli.d INCY, INCY, BASE_SHIFT + bge $r0, N, .L999 + bne INCX, TEMP, .L20 /* inc_x=1 */ + bne INCY, TEMP, .L20 /* inc_y=1 */ + + /* !((inc_x == 1) && (inc_y == 1)) */ + + /* init $vr8 and $vr9 to zero */ +#ifdef DOUBLE + vldrepl.d $vr0, X, 0 +#else + vldrepl.w $vr0, X, 0 +#endif +#ifdef DSDOT + vfcvtl.d.s $vr0, $vr0 + vfsub.d $vr8, $vr0, $vr0 + vfsub.d $vr9, $vr0, $vr0 +#else + VFSUB $vr8, $vr0, $vr0 + VFSUB $vr9, $vr0, $vr0 +#endif + +#ifdef DOUBLE + srai.d I, N, 3 +#else + srai.d I, N, 4 +#endif + bge $r0, I, .L12 /* FLOAT: <16 ; DOUBLE: <8 */ + .align 3 +.L11: + /* FLOAT: 16~ ; DOUBLE: 8~ */ + vld $vr0, X, 0 + vld $vr1, X, 16 + vld $vr2, X, 32 + vld $vr3, X, 48 + vld $vr4, Y, 0 + vld $vr5, Y, 16 + vld $vr6, Y, 32 + vld $vr7, Y, 48 + addi.w I, I, -1 + addi.d X, X, 64 + addi.d Y, Y, 64 +#ifdef DSDOT + vfcvtl.d.s $vr10, $vr0 + vfcvtl.d.s $vr11, $vr4 + vfcvth.d.s $vr12, $vr0 + vfcvth.d.s $vr13, $vr4 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr1 + vfcvtl.d.s $vr11, $vr5 + vfcvth.d.s $vr12, $vr1 + vfcvth.d.s $vr13, $vr5 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr2 + vfcvtl.d.s $vr11, $vr6 + vfcvth.d.s $vr12, $vr2 + vfcvth.d.s $vr13, $vr6 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 + vfcvtl.d.s $vr10, $vr3 + vfcvtl.d.s $vr11, $vr7 + vfcvth.d.s $vr12, $vr3 + vfcvth.d.s $vr13, $vr7 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 +#else + VFMADD $vr8, $vr0, $vr4, $vr8 + VFMADD $vr9, $vr1, $vr5, $vr9 + VFMADD $vr8, $vr2, $vr6, $vr8 + VFMADD $vr9, $vr3, $vr7, $vr9 +#endif + bnez I, .L11 + .align 3 +.L12: +#ifdef DOUBLE + andi I, N, 0x7 + srai.d I, I, 1 +#else + andi I, N, 0xf + srai.d I, I, 2 +#endif + bge $r0, I, .L14 /* DOUBLE: <2 ; FLOAT: <4 */ + .align 3 +.L13: + /* FLOAT: 4~15 ; DOUBLE: 2~7 */ + vld $vr0, X, 0 + vld $vr4, Y, 0 + addi.w I, I, -1 + addi.d X, X, 16 + addi.d Y, Y, 16 +#ifdef DSDOT + vfcvtl.d.s $vr10, $vr0 + vfcvtl.d.s $vr11, $vr4 + vfcvth.d.s $vr12, $vr0 + vfcvth.d.s $vr13, $vr4 + vfmadd.d $vr8, $vr10, $vr12, $vr8 + vfmadd.d $vr9, $vr11, $vr13, $vr9 +#else + VFMADD $vr8, $vr0, $vr4, $vr8 +#endif + bnez I, .L13 + .align 3 +.L14: + /* store dot in s1 $f8 */ +#ifdef DSDOT + vfadd.d $vr8, $vr8, $vr9 + fsub.s s2, s2, s2, /* set s2 to 0.0 */ + vpackod.d $vr0, $vr8, $vr8 + vfadd.d $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr9 + SUB s2, s2, s2 /* set s2 to 0.0 */ + vpackod.d $vr0, $vr8, $vr8 +#ifdef DOUBLE + VFADD $vr8, $vr8, $vr0 +#else + VFADD $vr8, $vr8, $vr0 + vpackod.w $vr0, $vr8, $vr8 + VFADD $vr8, $vr8, $vr0 +#endif /* defined DOUBLE */ +#endif /* defined DSDOT */ + .align 3 +.L15: +#ifdef DOUBLE + andi I, N, 0x1 +#else + andi I, N, 0x3 +#endif + bge $r0, I, .L999 /* =0 */ + .align 3 +.L16: + /* DOUBLE: 1 ; FLOAT: 1~3 */ + LD a1, X, 0 + LD b1, Y, 0 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + addi.d I, I, -1 + addi.d X, X, SIZE + addi.d Y, Y, SIZE + bnez I, .L16 + b .L999 + .align 3 + +.L20: +/* !((inc_x == 1) && (inc_y == 1)) */ + srai.d I, N, 3 +#ifdef F_INTERFACE + bgez INCX, .L21 + addi.d TEMP, N, -1 + mult TEMP, INCX + mflo TEMP + dsub X, X, TEMP + .align 3 + +.L21: + 
bgez INCY, .L22 + addi.d TEMP, N, -1 + mult TEMP, INCY + mflo TEMP + dsub Y, Y, TEMP + .align 3 + +.L22: +#endif + bge $r0, I, .L25 /* <8 */ + .align 3 + +.L23: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s2, b1, a1, s2 +#else + MADD s2, b1, a1, s2 +#endif + blt $r0, I, .L23 + .align 3 + +.L25: + andi I, N, 7 + bge $r0, I, .L999 + .align 3 + +.L26: + LD a1, X, 0 * SIZE + add.d X, X, INCX + LD b1, Y, 0 * SIZE + add.d Y, Y, INCY + addi.d I, I, -1 +#ifdef DSDOT + fcvt.d.s a1, a1 + fcvt.d.s b1, b1 + fmadd.d s1, b1, a1, s1 +#else + MADD s1, b1, a1, s1 +#endif + blt $r0, I, .L26 + .align 3 + +.L999: +#ifdef DSDOT + fadd.d $f0, s1, s2 +#else + ADD $f0, s1, s2 +#endif + move $r4, $r17 + jirl $r0, $r1, 0x0 + +EPILOGUE
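
Note (editorial, not part of the patch): the detection changes above drop the inline `cpucfg` read of LOONGARCH_CFG2 and instead query the HWCAP bits the kernel reports through getauxval(AT_HWCAP); the same bit positions (1U << 4 for LSX, 1U << 5 for LASX) drive both the build-time HAVE_LSX/HAVE_LASX defines in cpuid_loongarch64.c and the runtime dispatch in dynamic_loongarch64.c. A minimal stand-alone sketch of that query is shown below; it assumes a Linux/glibc environment providing <sys/auxv.h>, and the file name and build command are illustrative only.

/* hwcap_probe.c - illustrative sketch; mirrors the LA_HWCAP_* macros and the
 * core-type mapping used by get_coretype() in the diff above. */
#include <stdio.h>
#include <sys/auxv.h>

#define LA_HWCAP_LSX  (1U << 4)   /* kernel reports 128-bit LSX support */
#define LA_HWCAP_LASX (1U << 5)   /* kernel reports 256-bit LASX support */

int main(void) {
    unsigned long hwcap = getauxval(AT_HWCAP);

    if (hwcap & LA_HWCAP_LASX)
        puts("LASX available -> LOONGSON3R5 kernels");
    else if (hwcap & LA_HWCAP_LSX)
        puts("LSX available  -> LOONGSON2K1000 kernels");
    else
        puts("no vector extension -> LOONGSONGENERIC kernels");
    return 0;
}

Build with `cc hwcap_probe.c -o hwcap_probe` on a LoongArch64 Linux system. On other architectures the AT_HWCAP bits carry different meanings, so the output is only meaningful on LoongArch64.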