LoongArch64: Opt cgemv with LASX

gxw 2024-02-22 10:46:45 +08:00
parent 99ef76f9bb
commit d51ffec3a2
4 changed files with 968 additions and 0 deletions


@@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
CGEMVNKERNEL = cgemv_n_8_lasx.S
CGEMVTKERNEL = cgemv_t_8_lasx.S
CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c


@@ -0,0 +1,383 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
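/*
 * For reference only (not part of the build): a scalar C sketch of the
 * operation this kernel vectorizes, assuming column-major storage and
 * single-precision complex data stored as {real, imag} pairs. The
 * CONJ/XCONJ conjugation variants are omitted and the helper name is
 * illustrative.
 *
 *   static void cgemv_n_ref(BLASLONG m, BLASLONG n, float alpha_r, float alpha_i,
 *                           float *a, BLASLONG lda, float *x, BLASLONG inc_x,
 *                           float *y, BLASLONG inc_y)
 *   {
 *       for (BLASLONG j = 0; j < n; j++) {
 *           // t = alpha * x[j], computed once per column (see CLOAD_X_*)
 *           float xr = x[2 * j * inc_x], xi = x[2 * j * inc_x + 1];
 *           float tr = alpha_r * xr - alpha_i * xi;
 *           float ti = alpha_r * xi + alpha_i * xr;
 *           for (BLASLONG i = 0; i < m; i++) {
 *               // y[i] += t * A(i, j)
 *               float ar = a[2 * (j * lda + i)], ai = a[2 * (j * lda + i) + 1];
 *               y[2 * i * inc_y]     += tr * ar - ti * ai;
 *               y[2 * i * inc_y + 1] += tr * ai + ti * ar;
 *           }
 *       }
 *   }
 */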
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M8 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define Y0 $xr10
#define Y1 $xr11
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
#define TMP0 $xr28
#define TMP1 $xr29
#define TMP2 $xr30
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
.macro CLOAD_X_8
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_8_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X4, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X5, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X6, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X7, T0, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_8
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro CLOAD_Y_8_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
PTR_ALSL T0, INC_Y, Y, 2
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
PTR_ADD T0, T0, INC_Y
PTR_ADD T0, T0, INC_Y
fld.d $f18, T0, 0
fldx.d $f19, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
.macro CSTORE_Y_8_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 3
.endm
.macro CGEMV_N_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2, \
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, Y1, X4, A9, Y1, TMP0, TMP1, TMP2, \
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
.endm
.macro CSTORE_Y_8
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro CLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_1
fld.d $f10, Y, 0
.endm
.macro CGEMV_N_1x8
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, \
Y0, X4, A8, Y0, TMP0, TMP1, TMP2, \
Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
Y0, X7, A14, Y0, TMP0, TMP1, TMP2
.endm
.macro CSTORE_Y_1
fst.d $f10, Y, 0
.endm
.macro CGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
GCOMPLEXMADD GXCONJ, GCONJ, \
xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
CLOAD_\X_8
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
CLOAD_\Y_8
CGEMV_N_8x8
CSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 3
PTR_ADDI K, K, 8
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
andi I, M, 7
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
CLOAD_\Y_1
CGEMV_N_1x8
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
PTR_ALSL X, INC_X, X, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 7
beqz J, .L_END
.L_\XW\()_N_L1:
CLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
CLOAD_\Y_1
CGEMV_N_1x1
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
PTR_SUB K_LDA, LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
PTR_ALSL I, I, J, 1
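// I = ((inc_x != 1) << 1) | (inc_y != 1), used below to index .L_GAP_TABLE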
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
xvpackev.w $xr0, $xr1, $xr0
xvreplve0.d VALPHA, $xr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE


@@ -0,0 +1,342 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
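/*
 * For reference only (not part of the build): a scalar C sketch of the
 * transposed operation this kernel vectorizes, assuming column-major
 * storage and single-precision complex data stored as {real, imag} pairs.
 * The CONJ/XCONJ conjugation variants are omitted and the helper name is
 * illustrative.
 *
 *   static void cgemv_t_ref(BLASLONG m, BLASLONG n, float alpha_r, float alpha_i,
 *                           float *a, BLASLONG lda, float *x, BLASLONG inc_x,
 *                           float *y, BLASLONG inc_y)
 *   {
 *       for (BLASLONG j = 0; j < n; j++) {
 *           // dot = A(:, j)^T * x, accumulated per column
 *           float dr = 0.0f, di = 0.0f;
 *           for (BLASLONG i = 0; i < m; i++) {
 *               float ar = a[2 * (j * lda + i)], ai = a[2 * (j * lda + i) + 1];
 *               float xr = x[2 * i * inc_x], xi = x[2 * i * inc_x + 1];
 *               dr += ar * xr - ai * xi;
 *               di += ar * xi + ai * xr;
 *           }
 *           // y[j] += alpha * dot (cf. the final GCOMPLEXMADD with VALPHA below)
 *           y[2 * j * inc_y]     += alpha_r * dr - alpha_i * di;
 *           y[2 * j * inc_y + 1] += alpha_r * di + alpha_i * dr;
 *       }
 *   }
 */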
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
#define VALPHA $xr0
#define X0 $xr1
#define X1 $xr2
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
#define TMP0 $xr27
#define TMP1 $xr28
#define TMP2 $xr29
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
.macro ZERO_Y8
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
.macro CLOAD_X8
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
.macro CLOAD_X8_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
PTR_ALSL T0, INC_X, X, 2
fld.d $f2, T0, 0x00
fldx.d $f3, T0, INC_X
PTR_ALSL T0, INC_X, T0, 1
fld.d $f4, T0, 0x00
fldx.d $f5, T0, INC_X
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm
.macro CGEMV_T_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2, \
TP4, A8, X0, TP4, TMP0, TMP1, TMP2, TP4, A9, X1, TP4, TMP0, TMP1, TMP2, \
TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
.endm
.macro CGEMV_T_LASX XW:req, X8:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
ZERO_Y8
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
CLOAD_\X8
CGEMV_T_8x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
// Accumulate the per-column partial sums TP0..TP7 into Y0..Y7
GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
Y5, TP5, Y6, TP6, Y7, TP7
andi I, M, 7
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
fld.d $f15, PA4, 0x00
fld.d $f16, PA5, 0x00
fld.d $f17, PA6, 0x00
fld.d $f18, PA7, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
PTR_ALSL PY1, INC_Y, Y, 2
fld.d $f15, PY1, 0x00
fldx.d $f16, PY1, INC_Y
PTR_ALSL PY2, INC_Y, PY1, 1
fld.d $f17, PY2, 0x00
fldx.d $f18, PY2, INC_Y
GCOMPLEXMADD GXCONJ2, GCONJ2, \
xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2,\
A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2,\
A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
fst.d $f15, PY1, 0x00
fstx.d $f16, PY1, INC_Y
fst.d $f17, PY2, 0x00
fstx.d $f18, PY2, INC_Y
PTR_ALSL Y, INC_Y, Y, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 7
beqz J, .L_END
PTR_SUB K_LDA, LDA, M8
.L_\XW\()_N_1:
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
fld.d $f3, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
fst.d $f3, Y, 0x00
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
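// I = (inc_x != 1), used below to index .L_GAP_TABLE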
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
xvpackev.w $xr0, $xr1, $xr0
xvreplve0.d VALPHA, $xr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
CGEMV_T_LASX GAP_0, X8
.L_GAP_1: /* if (inc_x != 1) */
CGEMV_T_LASX GAP_1, X8_GAP
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE


@@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endif
.endm
//
// GCOMPLEXACC: horizontally accumulate the complex elements of a vector register
// pre_op: xvf or vf, selecting LASX or LSX instructions respectively
// suf_op: s or d, selecting single- or double-precision complex numbers
// Note: when "pre_op == xvf && suf_op == s", the input register (in) is also modified.
//
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more
.endif
.endm
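/*
 * Scalar view of the "xvf, s" case above (illustrative only, not part of
 * the build): the 256-bit register holds four single-precision complex
 * lanes, which are reduced to a single complex sum in the low lane.
 *
 *   typedef struct { float r, i; } cfloat_t;   // illustrative type
 *   static cfloat_t cacc4(const cfloat_t v[4])
 *   {
 *       cfloat_t out = { 0.0f, 0.0f };
 *       for (int k = 0; k < 4; k++) {          // add the four complex lanes
 *           out.r += v[k].r;
 *           out.i += v[k].i;
 *       }
 *       return out;
 *   }
 */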
//
// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
// xconj: default value 0.
// if !(xconj)
// out_r = in0_r * in1_r - in0_i * in1_i;
// out_i = in0_r * in1_i + in0_i * in1_r;
// else
// out_r = in0_r * in1_r + in0_i * in1_i;
// out_i = in0_r * in1_i - in0_i * in1_r;
// pre_op: xvf or vf, selecting LASX or LSX instructions respectively
// suf_op: s or d, selecting single- or double-precision complex numbers
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif
\pre_op\()mul.\suf_op \out, \tmp0, \in1
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
.ifnb \more
GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
.endif
.endm
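/*
 * Scalar equivalent of the formulas documented above (illustrative only,
 * not part of the build):
 *
 *   typedef struct { float r, i; } cfloat_t;   // illustrative type
 *   static cfloat_t cmul(int xconj, cfloat_t in0, cfloat_t in1)
 *   {
 *       cfloat_t out;
 *       if (!xconj) {                          // out = in0 * in1
 *           out.r = in0.r * in1.r - in0.i * in1.i;
 *           out.i = in0.r * in1.i + in0.i * in1.r;
 *       } else {                               // out = conj(in0) * in1
 *           out.r = in0.r * in1.r + in0.i * in1.i;
 *           out.i = in0.r * in1.i - in0.i * in1.r;
 *       }
 *       return out;
 *   }
 */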
//
// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
// xconj: default value 0
// conj: default value 0
// if !(CONJ)
// if !(XCONJ)
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
// out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
// else
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
// out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
// else
// if !(XCONJ)
// out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
// out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
// else
// out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
// out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
// pre_op: xvf or vf, selecting LASX or LSX instructions respectively
// suf_op: s or d, selecting single- or double-precision complex numbers
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
xvpackev.w \tmp0, \in0, \in0
.else
xvpackev.d \tmp0, \in0, \in0
.endif
.else
vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
vpackev.w \tmp0, \in0, \in0
.else
vpackev.d \tmp0, \in0, \in0
.endif
.endif
\pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj", "1"
\pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
xvshuf4i.w \tmp0, \tmp0, 0xb1
xvpackev.w \out, \tmp0, \tmp2
.else
xvshuf4i.d \tmp0, \tmp0, 0x0b
xvpackev.d \out, \tmp0, \tmp2
.endif
.else
.ifeqs "\suf_op", "s"
vshuf4i.w \tmp0, \tmp0, 0xb1
vpackev.w \out, \tmp0, \tmp2
.else
vshuf4i.d \tmp0, \tmp0, 0x0b
vpackev.d \out, \tmp0, \tmp2
.endif
.endif /* pre_op = xvf */
.else
\pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif /* conj = 1 */
\pre_op\()sub.\suf_op \tmp1, \tmp1, \in0
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \tmp1
.else
xvpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.w \tmp1, \in0, \in0
.else
xvpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \tmp1
.else
xvpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
xvpackod.d \tmp1, \in0, \in0
.else
xvpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \tmp1
.else
vpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.w \tmp1, \in0, \in0
.else
vpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \tmp1
.else
vpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
vpackod.d \tmp1, \in0, \in0
.else
vpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif
\pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out
.ifnb \more
GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
.endif
.endm
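/*
 * Scalar equivalent of the four cases documented above (illustrative only,
 * not part of the build):
 *
 *   typedef struct { float r, i; } cfloat_t;   // illustrative type
 *   static cfloat_t cmadd(int xconj, int conj, cfloat_t in0, cfloat_t in1, cfloat_t in2)
 *   {
 *       cfloat_t out = in2;
 *       float rr = in0.r * in1.r, ii = in0.i * in1.i;
 *       float ri = in0.r * in1.i, ir = in0.i * in1.r;
 *       if (!conj) {
 *           out.r += !xconj ? rr - ii : rr + ii;
 *           out.i += !xconj ? ri + ir : ri - ir;
 *       } else {
 *           out.r += !xconj ? rr + ii : rr - ii;
 *           out.i -= !xconj ? ri - ir : ri + ir;
 *       }
 *       return out;
 *   }
 */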
//
// Media Related Macros
//