Merge pull request #4536 from XiWeiGu/loongarch64-cgemv-zgemv-opt

LoongArch64: cgemv/zgemv optimization
Martin Kroeker 2024-02-28 10:15:34 +01:00 committed by GitHub
commit cfbb701497
8 changed files with 1212 additions and 26 deletions


@@ -100,6 +100,9 @@ DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
DTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
CGEMVNKERNEL = cgemv_n_4_lsx.S
CGEMVTKERNEL = cgemv_t_4_lsx.S
CGEMMKERNEL = cgemm_kernel_8x4_lsx.S
CGEMMINCOPY = cgemm_ncopy_8_lsx.S
CGEMMITCOPY = cgemm_tcopy_8_lsx.S
@@ -115,6 +118,9 @@ CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
CTRSMKERNEL_RT = ../generic/trsm_kernel_RT.c
ZGEMVNKERNEL = zgemv_n_2_lsx.S
ZGEMVTKERNEL = zgemv_t_2_lsx.S
ZGEMMKERNEL = zgemm_kernel_4x4_lsx.S
ZGEMMONCOPY = zgemm_ncopy_4_lsx.S
ZGEMMOTCOPY = zgemm_tcopy_4_lsx.S


@@ -0,0 +1,323 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M8 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $vr1
#define X0 $vr2
#define X1 $vr3
#define X2 $vr4
#define X3 $vr5
#define X4 $vr6
#define X5 $vr7
#define X6 $vr8
#define X7 $vr9
#define Y0 $vr10
#define Y1 $vr11
#define A0 $vr12
#define A1 $vr13
#define A2 $vr14
#define A3 $vr15
#define A4 $vr16
#define A5 $vr17
#define A6 $vr18
#define A7 $vr19
#define A8 $vr20
#define A9 $vr21
#define A10 $vr22
#define A11 $vr23
#define A12 $vr24
#define A13 $vr25
#define A14 $vr26
#define A15 $vr27
#define TMP0 $vr28
#define TMP1 $vr29
#define TMP2 $vr30
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
.macro CLOAD_X_4
GLDREPL v, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_4_GAP
vldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
vldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
vldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
vldrepl.d X3, T0, 0x00
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_1
GLDREPL v, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
vf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_4
GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro CLOAD_Y_4_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
vpackev.d Y0, A1, Y0
vpackev.d Y1, A5, Y1
.endm
.macro CLOAD_Y_1
fld.d $f10, Y, 0
.endm
.macro CSTORE_Y_4
GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro CSTORE_Y_4_GAP
vstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
vstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
vstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
vstelm.d Y1, T0, 0, 1
.endm
.macro CSTORE_Y_1
fst.d $f10, Y, 0
.endm
.macro CGEMV_N_4x4
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_1x4
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
.macro CGEMV_N_LSX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
PTR_SRLI J, N, 2
beqz J, .L_\XW\()_N_3
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
CLOAD_\X_4
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 2
beqz I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
CLOAD_\Y_4
CGEMV_N_4x4
CSTORE_\Y_4
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
CLOAD_\Y_1
CGEMV_N_1x4
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
PTR_ALSL X, INC_X, X, 2
bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
andi J, N, 3
beqz J, .L_END
.L_\XW\()_N_L1:
CLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
CLOAD_\Y_1
CGEMV_N_1x1
CSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
PTR_SUB K_LDA, LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
PTR_ALSL I, I, J, 1
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
vpackev.w $vr0, $vr1, $vr0
vpackev.d VALPHA, $vr0, $vr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
CGEMV_N_LSX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
CGEMV_N_LSX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
CGEMV_N_LSX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
CGEMV_N_LSX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE
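
Editor's note: for orientation, a minimal scalar C sketch of what the N-form kernel above computes in the non-conjugated case (an illustration, not part of the patch; the name cgemv_n_ref and the C99 complex types are assumptions). The vector code scales x[j] by alpha once per column (CLOAD_X_*, with VALPHA holding alpha_r/alpha_i replicated across the register) and then accumulates that scaled value against four columns of A at a time; CONJ/XCONJ only change which operands the GCOMPLEXMUL/GCOMPLEXMADD macros conjugate.

#include <complex.h>

/* Hypothetical reference routine; lda, inc_x, inc_y are in complex elements,
 * A is column-major, and complex values are stored as (re, im) float pairs. */
static void cgemv_n_ref(long m, long n, float alpha_r, float alpha_i,
                        const float *a, long lda,
                        const float *x, long inc_x,
                        float *y, long inc_y)
{
    float complex alpha = alpha_r + alpha_i * I;
    for (long j = 0; j < n; j++) {
        /* temp = alpha * x[j], computed once per column (CLOAD_X_* above) */
        float complex xj = x[2 * j * inc_x] + x[2 * j * inc_x + 1] * I;
        float complex temp = alpha * xj;
        const float *col = a + 2 * j * lda;
        for (long i = 0; i < m; i++) {
            float complex aij = col[2 * i] + col[2 * i + 1] * I;
            float complex yi = y[2 * i * inc_y] + y[2 * i * inc_y + 1] * I;
            yi += temp * aij;                  /* CGEMV_N_4x4 / _1x4 / _1x1 above */
            y[2 * i * inc_y] = crealf(yi);
            y[2 * i * inc_y + 1] = cimagf(yi);
        }
    }
}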


@@ -122,14 +122,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_X_8_GAP
@@ -150,14 +150,14 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvldrepl.d X7, T0, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2, \
X4, X4, VALPHA, TMP0, TMP1, TMP2, \
X5, X5, VALPHA, TMP0, TMP1, TMP2, \
X6, X6, VALPHA, TMP0, TMP1, TMP2, \
X7, X7, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2, \
X4, VALPHA, X4, TMP0, TMP1, TMP2, \
X5, VALPHA, X5, TMP0, TMP1, TMP2, \
X6, VALPHA, X6, TMP0, TMP1, TMP2, \
X7, VALPHA, X7, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_8
@@ -228,7 +228,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.macro CLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
xvf, s, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro CLOAD_Y_1


@@ -0,0 +1,290 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
#define VALPHA $vr0
#define X0 $vr1
#define X1 $vr2
#define A0 $vr3
#define A1 $vr4
#define A2 $vr5
#define A3 $vr6
#define A4 $vr7
#define A5 $vr8
#define A6 $vr9
#define A7 $vr10
#define A8 $vr11
#define A9 $vr12
#define A10 $vr13
#define A11 $vr14
#define A12 $vr15
#define A13 $vr16
#define A14 $vr17
#define A15 $vr18
#define TP0 $vr19
#define TP1 $vr20
#define TP2 $vr21
#define TP3 $vr22
#define TP4 $vr23
#define TP5 $vr24
#define TP6 $vr25
#define TP7 $vr26
#define TMP0 $vr27
#define TMP1 $vr28
#define TMP2 $vr29
#define Y0 $vr3
#define Y1 $vr4
#define Y2 $vr5
#define Y3 $vr6
#define Y4 $vr7
#define Y5 $vr8
#define Y6 $vr9
#define Y7 $vr10
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
.macro ZERO_Y4
GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
.macro ZERO_Y1
GXOR v, v, TP0, TP0, TP0
.endm
.macro CLOAD_X4
GLD v, , X0, X, 0x00, X1, X, 0x10
.endm
.macro CLOAD_X4_GAP
fld.d $f1, X, 0x00
fldx.d $f3, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f2, T0, 0x00
fldx.d $f4, T0, INC_X
vpackev.d X0, A0, X0
vpackev.d X1, A1, X1
.endm
.macro CGEMV_T_4x4
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm
.macro CGEMV_T_LSX XW:req, X4:req
PTR_SRLI J, N, 2
beqz J, .L_\XW\()_N_3
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L4:
ZERO_Y4
move X, X_ORG
PTR_SRLI I, M, 2
beqz I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
CLOAD_\X4
CGEMV_T_4x4
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 2
bnez I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
// Accumulated
GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#else
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#endif
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
PTR_ALSL Y, INC_Y, Y, 2
bnez J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
andi J, N, 3
beqz J, .L_END
PTR_SUB K_LDA, LDA, M8
.L_\XW\()_N_1:
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
fld.d $f3, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
fst.d $f3, Y, 0x00
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
// Init VALPHA
vpackev.w $vr0, $vr1, $vr0
vpackev.d VALPHA, $vr0, $vr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
CGEMV_T_LSX GAP_0, X4
.L_GAP_1: /* if (incx != 1) */
CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE
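
Editor's note: again for orientation only, a scalar C sketch of the T-form kernel above for the non-conjugated case (an illustration; the name cgemv_t_ref and the complex.h types are assumptions). Each column of A is first reduced against x into a vector accumulator (ZERO_Y*/CGEMV_T_4x4, controlled by GXCONJ1/GCONJ1), and alpha is applied once per column when y is updated (the final GCOMPLEXMADD with VALPHA, controlled by GXCONJ2/GCONJ2).

#include <complex.h>

/* Hypothetical reference routine; lda, inc_x, inc_y are in complex elements,
 * A is column-major, and complex values are stored as (re, im) float pairs. */
static void cgemv_t_ref(long m, long n, float alpha_r, float alpha_i,
                        const float *a, long lda,
                        const float *x, long inc_x,
                        float *y, long inc_y)
{
    float complex alpha = alpha_r + alpha_i * I;
    for (long j = 0; j < n; j++) {
        const float *col = a + 2 * j * lda;
        float complex t = 0.0f;               /* ZERO_Y4 / ZERO_Y1 above */
        for (long i = 0; i < m; i++) {
            float complex aij = col[2 * i] + col[2 * i + 1] * I;
            float complex xi = x[2 * i * inc_x] + x[2 * i * inc_x + 1] * I;
            t += aij * xi;                     /* CGEMV_T_4x4 accumulation */
        }
        float complex yj = y[2 * j * inc_y] + y[2 * j * inc_y + 1] * I;
        yj += alpha * t;                       /* final GCOMPLEXMADD with VALPHA */
        y[2 * j * inc_y] = crealf(yj);
        y[2 * j * inc_y + 1] = cimagf(yj);
    }
}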


@@ -406,9 +406,12 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.ifeqs "\suf_op", "s"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.else
vor.v \out, \in, \in
.endif
.endif
.ifnb \more
GCOMPLEXACC \pre_op, \suf_op, \more
.endif


@@ -0,0 +1,296 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M16 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $vr1
#define X0 $vr2
#define X1 $vr3
#define X2 $vr4
#define X3 $vr5
#define X4 $vr6
#define X5 $vr7
#define X6 $vr8
#define X7 $vr9
#define Y0 $vr10
#define Y1 $vr11
#define A0 $vr12
#define A1 $vr13
#define A2 $vr14
#define A3 $vr15
#define A4 $vr16
#define A5 $vr17
#define A6 $vr18
#define A7 $vr19
#define A8 $vr20
#define A9 $vr21
#define A10 $vr22
#define A11 $vr23
#define A12 $vr24
#define A13 $vr25
#define A14 $vr26
#define A15 $vr27
#define TMP0 $vr28
#define TMP1 $vr29
#define TMP2 $vr30
#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 0
#else
#define GXCONJ 1
#define GCONJ 0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ 1
#else
#define GXCONJ 1
#define GCONJ 1
#endif
#endif
.macro ZLOAD_X_2
GLD v, , X0, X, 0x00, X1, X, 0x10
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_2_GAP
vld X0, X, 0
PTR_ADD T0, X, INC_X
vld X1, T0, 0
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_1
GLD v, , X0, X, 0x00
GCOMPLEXMUL GXCONJ, \
vf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_Y_2
GLD v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro ZLOAD_Y_2_GAP
vld $vr10, Y, 0
vldx $vr11, Y, INC_Y
.endm
.macro ZLOAD_Y_1
vld $vr10, Y, 0
.endm
.macro ZGEMV_N_2x2
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x2
GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
Y0, X1, A2, Y0, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x1
GLD_INC v, , 0x10, $vr12, PA0, 0
GCOMPLEXMADD GXCONJ, GCONJ, \
vf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm
.macro ZSTORE_Y_2
GST v, , Y0, Y, 0, Y1, Y, 0x10
.endm
.macro ZSTORE_Y_2_GAP
vst Y0, Y, 0
vstx Y1, Y, INC_Y
.endm
.macro ZSTORE_Y_1
vst $vr10, Y, 0
.endm
.macro ZGEMV_N_LSX XW:req, X_2:req, X_1:req, Y_2:req, Y_1:req
PTR_SRLI J, N, 1
beqz J, .L_\XW\()_N_1
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
ZLOAD_\X_2
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 1
beqz I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
ZLOAD_\Y_2
ZGEMV_N_2x2
ZSTORE_\Y_2
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 1
PTR_ADDI K, K, 4
bnez I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
andi I, M, 1
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
ZLOAD_\Y_1
ZGEMV_N_1x2
ZSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
PTR_ALSL X, INC_X, X, 1
bnez J, .L_\XW\()_N_L2
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
.L_\XW\()_N_L1:
ZLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
ZLOAD_\Y_1
ZGEMV_N_1x1
ZSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
PTR_SUB K_LDA, LDA, M16
PTR_ADD PA0, PA0, K_LDA
PTR_ADD X, X, INC_X
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 31
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) j = 0; else j = 1; */
PTR_ALSL I, I, J, 1
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA
vpackev.d VALPHA, $vr1, $vr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA
#else
GADD , d, PA1, PA0, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0 // Obtain the offset address
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (incy == 1) */
ZGEMV_N_LSX GAP_0_0, X_2, X_1, Y_2, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (incy != 1) */
ZGEMV_N_LSX GAP_0_1, X_2, X_1, Y_2_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (incy == 1) */
ZGEMV_N_LSX GAP_1_0, X_2_GAP, X_1, Y_2, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (incy != 1) */
ZGEMV_N_LSX GAP_1_1, X_2_GAP, X_1, Y_2_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 31
jirl $r0, $r1, 0x0
EPILOGUE
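
Editor's note: the new kernels share the stride-dispatch pattern seen in the PROLOGUE above: an index built from whether inc_x and inc_y equal 1 selects one of the .L_GAP_* specializations through the half-word offset table, so the fully vectorized load/store paths are used only for unit strides. A small C model of the index computation (an illustration; gap_index is an assumed name):

/* Mirrors: PTR_SUB/maskeqz on (inc_x - 1) and (inc_y - 1), then
 * PTR_ALSL I, I, J, 1  =>  index = 2*(inc_x != 1) + (inc_y != 1).
 * 0 -> .L_GAP_0_0, 1 -> .L_GAP_0_1, 2 -> .L_GAP_1_0, 3 -> .L_GAP_1_1.
 * (The *gemv_t kernels use only the inc_x bit and a two-entry table.) */
static int gap_index(long inc_x, long inc_y)
{
    int i = (inc_x == 1) ? 0 : 1;
    int j = (inc_y == 1) ? 0 : 1;
    return (i << 1) + j;
}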


@@ -122,10 +122,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLD xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_X_4_GAP
@@ -145,10 +145,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
xvpermi.q X3, X3, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
X1, X1, VALPHA, TMP0, TMP1, TMP2, \
X2, X2, VALPHA, TMP0, TMP1, TMP2, \
X3, X3, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2, \
X1, VALPHA, X1, TMP0, TMP1, TMP2, \
X2, VALPHA, X2, TMP0, TMP1, TMP2, \
X3, VALPHA, X3, TMP0, TMP1, TMP2
.endm
.macro ZLOAD_Y_4
@@ -216,7 +216,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
GLD xv, , X0, X, 0x00
GPERMI xv, q, X0, X0, 0
GCOMPLEXMUL GXCONJ, \
xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
xvf, d, X0, VALPHA, X0, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_N_1x1


@@ -0,0 +1,268 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
#define M $r4
#define N $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M16 $r30
#define VALPHA $vr0
#define X0 $vr1
#define X1 $vr2
#define A0 $vr3
#define A1 $vr4
#define A2 $vr5
#define A3 $vr6
#define A4 $vr7
#define A5 $vr8
#define A6 $vr9
#define A7 $vr10
#define A8 $vr11
#define A9 $vr12
#define A10 $vr13
#define A11 $vr14
#define A12 $vr15
#define A13 $vr16
#define A14 $vr17
#define A15 $vr18
#define TP0 $vr19
#define TP1 $vr20
#define TP2 $vr21
#define TP3 $vr22
#define TP4 $vr23
#define TP5 $vr24
#define TP6 $vr25
#define TP7 $vr26
#define TMP0 $vr27
#define TMP1 $vr28
#define TMP2 $vr29
#define Y0 $vr3
#define Y1 $vr4
#define Y2 $vr5
#define Y3 $vr6
#define Y4 $vr7
#define Y5 $vr8
#define Y6 $vr9
#define Y7 $vr10
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1 0
#else
#define GXCONJ1 1
#define GCONJ1 0
#endif
#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2 0
#else
#define GXCONJ2 0
#define GCONJ2 1
#endif
.macro ZERO_Y2
GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
.macro ZERO_Y1
GXOR v, v, TP0, TP0, TP0
.endm
.macro ZLOAD_X2
GLD v, , X0, X, 0x00, X1, X, 0x10
.endm
.macro ZLOAD_X2_GAP
vld X0, X, 0
vldx X1, X, INC_X
.endm
.macro ZGEMV_T_2x2
GLD_INC v, , 0x10, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2
.endm
.macro ZGEMV_T_LSX XW:req, X2:req
PTR_SRLI J, N, 1
beqz J, .L_\XW\()_N_1
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M16
.L_\XW\()_N_L2:
ZERO_Y2
move X, X_ORG
PTR_SRLI I, M, 1
beqz I, .L_\XW\()_M_1
.align 5
.L_\XW\()_M_L2:
ZLOAD_\X2
ZGEMV_T_2x2
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 1
bnez I, .L_\XW\()_M_L2
.L_\XW\()_M_1:
// Accumulated
GCOMPLEXACC vf, d, Y0, TP0, Y1, TP1
andi I, M, 1
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
GLD v, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
#elif __loongarch_grlen == 32
GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10
#else
GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10
#endif
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
vld A8, Y, 0x00
vldx A9, Y, INC_Y
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#elif __loongarch_grlen == 32
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
vst $vr11, Y, 0x00
vstx $vr12, Y, INC_Y
PTR_ALSL Y, INC_Y, Y, 1
bnez J, .L_\XW\()_N_L2
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
PTR_SUB K_LDA, LDA, M16
.L_\XW\()_N_L1:
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
GLD v, , A0, PA0, 0x00, X0, X, 0x00
GCOMPLEXMADD GXCONJ1, GCONJ1, \
vf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x10
bnez I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
PTR_ADDI J, J, -1
vld A0, Y, 0x00
GCOMPLEXMADD GXCONJ2, GCONJ2, \
vf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
vst $vr3, Y, 0x00
PTR_ADD PA0, PA0, K_LDA
PTR_ADD Y, Y, INC_Y
bnez J, .L_\XW\()_N_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 30
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
// Init VALPHA
vpackev.d VALPHA, $vr1, $vr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA
#elif __loongarch_grlen == 32
GADD , w, PA1, PA0, LDA
#else
GADD , d, PA1, PA0, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
ZGEMV_T_LSX GAP_0, X2
.L_GAP_1: /* if (incx != 1) */
ZGEMV_T_LSX GAP_1, X2_GAP
.L_END:
pop_if_used 17 + 8, 30
jirl $r0, $r1, 0x0
EPILOGUE