Merge pull request #4516 from XiWeiGu/loongarch64-cgemv-zgemv-opt
Loongarch64 cgemv zgemv opt
commit e5c93d1f37
@@ -121,6 +121,9 @@ CGEMMITCOPYOBJ = cgemm_itcopy$(TSUFFIX).$(SUFFIX)
 CGEMMONCOPYOBJ = cgemm_oncopy$(TSUFFIX).$(SUFFIX)
 CGEMMOTCOPYOBJ = cgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+CGEMVNKERNEL = cgemv_n_8_lasx.S
+CGEMVTKERNEL = cgemv_t_8_lasx.S
+
 CTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 CTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 CTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
@@ -136,6 +139,9 @@ ZGEMMITCOPYOBJ = zgemm_itcopy$(TSUFFIX).$(SUFFIX)
 ZGEMMONCOPYOBJ = zgemm_oncopy$(TSUFFIX).$(SUFFIX)
 ZGEMMOTCOPYOBJ = zgemm_otcopy$(TSUFFIX).$(SUFFIX)
 
+ZGEMVNKERNEL = zgemv_n_4_lasx.S
+ZGEMVTKERNEL = zgemv_t_4_lasx.S
+
 ZTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
 ZTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
 ZTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c
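All four kernels registered above implement OpenBLAS's standard complex GEMV kernel entry point, quoted in the CNAME comment at the top of each new file. A C-level restatement of that contract follows; it is a sketch, not project code: the function name is hypothetical (the real symbol comes from the CNAME macro), the typedefs stand in for common.h's definitions (assuming BLASLONG is a 64-bit integer and FLOAT is float for the c-prefixed kernels, double for the z-prefixed ones), and treating dummy1 and buffer as unused is an inference from the code paths below.

/* Illustrative C view of the kernel entry point; parameter list quoted
 * from the CNAME comment in the new .S files. */
typedef long long BLASLONG;   /* stand-in for common.h (assumption) */
typedef float FLOAT;          /* float for cgemv_*, double for zgemv_* */

int cgemv_kernel(BLASLONG m, BLASLONG n, BLASLONG dummy1, /* dummy1 unused here */
                 FLOAT alpha_r, FLOAT alpha_i,
                 FLOAT *a, BLASLONG lda,
                 FLOAT *x, BLASLONG inc_x,
                 FLOAT *y, BLASLONG inc_y,
                 FLOAT *buffer);                          /* buffer unused here */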
cgemv_n_8_lasx.S (new file)
@@ -0,0 +1,383 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST  : OK
*
*
*********************************************************************/

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

#define J       $r12
#define I       $r13
#define K       $r14
#define Y_ORG   $r15
#define OFFSET  $r16
#define K_LDA   $r17
#define M8      $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29

#define VALPHA  $xr1
#define X0      $xr2
#define X1      $xr3
#define X2      $xr4
#define X3      $xr5
#define X4      $xr6
#define X5      $xr7
#define X6      $xr8
#define X7      $xr9
#define Y0      $xr10
#define Y1      $xr11
#define A0      $xr12
#define A1      $xr13
#define A2      $xr14
#define A3      $xr15
#define A4      $xr16
#define A5      $xr17
#define A6      $xr18
#define A7      $xr19
#define A8      $xr20
#define A9      $xr21
#define A10     $xr22
#define A11     $xr23
#define A12     $xr24
#define A13     $xr25
#define A14     $xr26
#define A15     $xr27
#define TMP0    $xr28
#define TMP1    $xr29
#define TMP2    $xr30

#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  0
#else
#define GXCONJ 1
#define GCONJ  0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  1
#else
#define GXCONJ 1
#define GCONJ  1
#endif
#endif

.macro CLOAD_X_8
    GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
                   X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
                X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm

.macro CLOAD_X_8_GAP
    xvldrepl.d X0, X, 0x00
    PTR_ADD    T0, X, INC_X
    xvldrepl.d X1, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X2, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X3, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X4, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X5, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X6, T0, 0x00
    PTR_ADD    T0, T0, INC_X
    xvldrepl.d X7, T0, 0x00

    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2, \
                X4, X4, VALPHA, TMP0, TMP1, TMP2, \
                X5, X5, VALPHA, TMP0, TMP1, TMP2, \
                X6, X6, VALPHA, TMP0, TMP1, TMP2, \
                X7, X7, VALPHA, TMP0, TMP1, TMP2
.endm

.macro CLOAD_Y_8
    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm

.macro CLOAD_Y_8_GAP
    fld.d    $f10, Y, 0
    fldx.d   $f13, Y, INC_Y
    PTR_ALSL T0, INC_Y, Y, 1
    fld.d    $f14, T0, 0
    fldx.d   $f15, T0, INC_Y
    PTR_ALSL T0, INC_Y, Y, 2
    fld.d    $f11, T0, 0
    fldx.d   $f17, T0, INC_Y
    PTR_ADD  T0, T0, INC_Y
    PTR_ADD  T0, T0, INC_Y
    fld.d    $f18, T0, 0
    fldx.d   $f19, T0, INC_Y
    GINSVE0  xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm

.macro CSTORE_Y_8_GAP
    xvstelm.d Y0, Y, 0, 0
    PTR_ADD   T0, Y, INC_Y
    xvstelm.d Y0, T0, 0, 1
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y0, T0, 0, 2
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y0, T0, 0, 3

    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 0
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 1
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 2
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 3
.endm

.macro CGEMV_N_8x8
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A1,  PA0, 0, \
            A2,  PA1, 0, A3,  PA1, 0, \
            A4,  PA2, 0, A5,  PA2, 0, \
            A6,  PA3, 0, A7,  PA3, 0, \
            A8,  PA4, 0, A9,  PA4, 0, \
            A10, PA5, 0, A11, PA5, 0, \
            A12, PA6, 0, A13, PA6, 0, \
            A14, PA7, 0, A15, PA7, 0

    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
                 Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, Y1, X1, A3,  Y1, TMP0, TMP1, TMP2, \
                 Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, Y1, X2, A5,  Y1, TMP0, TMP1, TMP2, \
                 Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, Y1, X3, A7,  Y1, TMP0, TMP1, TMP2, \
                 Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, Y1, X4, A9,  Y1, TMP0, TMP1, TMP2, \
                 Y0, X5, A10, Y0, TMP0, TMP1, TMP2, Y1, X5, A11, Y1, TMP0, TMP1, TMP2, \
                 Y0, X6, A12, Y0, TMP0, TMP1, TMP2, Y1, X6, A13, Y1, TMP0, TMP1, TMP2, \
                 Y0, X7, A14, Y0, TMP0, TMP1, TMP2, Y1, X7, A15, Y1, TMP0, TMP1, TMP2
.endm

.macro CSTORE_Y_8
    GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm

.macro CLOAD_X_1
    GLDREPL xv, d, X0, X, 0x00
    GCOMPLEXMUL GXCONJ, \
                xvf, s, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm

.macro CLOAD_Y_1
    fld.d $f10, Y, 0
.endm

.macro CGEMV_N_1x8
    GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
                        $f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
                 Y0, X1, A2,  Y0, TMP0, TMP1, TMP2, \
                 Y0, X2, A4,  Y0, TMP0, TMP1, TMP2, \
                 Y0, X3, A6,  Y0, TMP0, TMP1, TMP2, \
                 Y0, X4, A8,  Y0, TMP0, TMP1, TMP2, \
                 Y0, X5, A10, Y0, TMP0, TMP1, TMP2, \
                 Y0, X6, A12, Y0, TMP0, TMP1, TMP2, \
                 Y0, X7, A14, Y0, TMP0, TMP1, TMP2
.endm

.macro CSTORE_Y_1
    fst.d $f10, Y, 0
.endm

.macro CGEMV_N_1x1
    fld.d    $f12, PA0, 0
    PTR_ADDI PA0, PA0, 0x08
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, s, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm

.macro CGEMV_N_LASX XW:req, X_8:req, X_1:req, Y_8:req, Y_1:req
    PTR_SRLI J, N, 3
    beqz     J, .L_\XW\()_N_7
    PTR_SLLI K_LDA, LDA, 3
    PTR_SUB  K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
    CLOAD_\X_8
    xor      K, K, K
    move     Y, Y_ORG
    PTR_SRLI I, M, 3
    beqz     I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
    CLOAD_\Y_8
    CGEMV_N_8x8
    CSTORE_\Y_8
    PTR_ADDI I, I, -1
    PTR_ALSL Y, INC_Y, Y, 3
    PTR_ADDI K, K, 8
    bnez     I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
    andi     I, M, 7
    beqz     I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x8
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    PTR_ALSL X, INC_X, X, 3
    bnez     J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
    andi     J, N, 7
    beqz     J, .L_END
.L_\XW\()_N_L1:
    CLOAD_\X_1
    xor      K, K, K
    move     Y, Y_ORG
    move     I, M
    beqz     I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    CLOAD_\Y_1
    CGEMV_N_1x1
    CSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    PTR_SUB  K_LDA, LDA, M8
    PTR_ADD  PA0, PA0, K_LDA
    PTR_ADD  X, X, INC_X
    bnez     J, .L_\XW\()_N_L1

    b .L_END
.endm

PROLOGUE
    PTR_LD   INC_Y, $sp, 0
    push_if_used 17 + 7, 31
    PTR_ADDI K, $r0, 0x01
    PTR_SUB  I, INC_X, K
    PTR_SUB  J, INC_Y, K
    maskeqz  I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
    maskeqz  J, K, J /* if (inc_y == 1) J = 0; else J = 1; */
    PTR_ALSL I, I, J, 1
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA
    xvpackev.w  $xr0, $xr1, $xr0
    xvreplve0.d VALPHA, $xr0
    move Y_ORG, Y
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h     K, I, 0 // Obtain the offset address
    PTR_ADD  T0, T0, K
    jirl     $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0_0 - .L_GAP_TABLE
    .hword .L_GAP_0_1 - .L_GAP_TABLE
    .hword .L_GAP_1_0 - .L_GAP_TABLE
    .hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
    CGEMV_N_LASX GAP_0_0, X_8, X_1, Y_8, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
    CGEMV_N_LASX GAP_0_1, X_8, X_1, Y_8_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
    CGEMV_N_LASX GAP_1_0, X_8_GAP, X_1, Y_8, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
    CGEMV_N_LASX GAP_1_1, X_8_GAP, X_1, Y_8_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl $r0, $r1, 0x0
EPILOGUE
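For cross-checking the vectorized path above, here is a minimal scalar model of what cgemv_n computes in the default case (neither CONJ nor XCONJ defined, so GXCONJ = GCONJ = 0). It is an illustrative sketch, not OpenBLAS code, and the helper name is made up.

/* Hedged scalar model of cgemv_n, plain (non-conjugated) case:
 * y += alpha * A * x, with A column-major (column stride lda) and all
 * complex values stored as interleaved (re, im) float pairs. */
static void cgemv_n_ref(long m, long n, float alpha_r, float alpha_i,
                        const float *a, long lda,
                        const float *x, long inc_x,
                        float *y, long inc_y)
{
    for (long j = 0; j < n; j++) {
        /* t = alpha * x[j], mirroring CLOAD_X_* (GCOMPLEXMUL by VALPHA) */
        const float *xj = &x[2 * j * inc_x];
        float tr = alpha_r * xj[0] - alpha_i * xj[1];
        float ti = alpha_r * xj[1] + alpha_i * xj[0];
        const float *col = &a[2 * j * lda];
        for (long i = 0; i < m; i++) {
            /* y[i] += t * A[i][j], mirroring GCOMPLEXMADD in CGEMV_N_8x8 */
            float ar = col[2 * i], ai = col[2 * i + 1];
            y[2 * i * inc_y]     += tr * ar - ti * ai;
            y[2 * i * inc_y + 1] += tr * ai + ti * ar;
        }
    }
}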
cgemv_t_8_lasx.S (new file)
@@ -0,0 +1,342 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST  : OK
*
*
*********************************************************************/

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

#define J       $r12
#define I       $r13
#define K       $r14
#define PY0     $r14
#define X_ORG   $r15
#define PY1     $r16
#define K_LDA   $r17
#define PY2     $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29
#define M8      $r30

#define VALPHA  $xr0
#define X0      $xr1
#define X1      $xr2
#define A0      $xr3
#define A1      $xr4
#define A2      $xr5
#define A3      $xr6
#define A4      $xr7
#define A5      $xr8
#define A6      $xr9
#define A7      $xr10
#define A8      $xr11
#define A9      $xr12
#define A10     $xr13
#define A11     $xr14
#define A12     $xr15
#define A13     $xr16
#define A14     $xr17
#define A15     $xr18
#define TP0     $xr19
#define TP1     $xr20
#define TP2     $xr21
#define TP3     $xr22
#define TP4     $xr23
#define TP5     $xr24
#define TP6     $xr25
#define TP7     $xr26
#define TMP0    $xr27
#define TMP1    $xr28
#define TMP2    $xr29
#define Y0      $xr3
#define Y1      $xr4
#define Y2      $xr5
#define Y3      $xr6
#define Y4      $xr7
#define Y5      $xr8
#define Y6      $xr9
#define Y7      $xr10

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1  0
#else
#define GXCONJ1 1
#define GCONJ1  0
#endif

#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2  0
#else
#define GXCONJ2 0
#define GCONJ2  1
#endif

.macro ZERO_Y8
    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
                TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm

.macro ZERO_Y1
    GXOR xv, v, TP0, TP0, TP0
.endm

.macro CLOAD_X8
    GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm

.macro CLOAD_X8_GAP
    fld.d    $f1, X, 0x00
    fldx.d   $f2, X, INC_X
    PTR_ALSL T0, INC_X, X, 1
    fld.d    $f3, T0, 0x00
    fldx.d   $f4, T0, INC_X
    GINSVE0  xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
    PTR_ALSL T0, INC_X, X, 2
    fld.d    $f2, T0, 0x00
    fldx.d   $f3, T0, INC_X
    PTR_ALSL T0, INC_X, T0, 1
    fld.d    $f4, T0, 0x00
    fldx.d   $f5, T0, INC_X
    GINSVE0  xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm

.macro CGEMV_T_8x8
    GLD_INC xv, , 0x20, \
            A0,  PA0, 0, A1,  PA0, 0, \
            A2,  PA1, 0, A3,  PA1, 0, \
            A4,  PA2, 0, A5,  PA2, 0, \
            A6,  PA3, 0, A7,  PA3, 0, \
            A8,  PA4, 0, A9,  PA4, 0, \
            A10, PA5, 0, A11, PA5, 0, \
            A12, PA6, 0, A13, PA6, 0, \
            A14, PA7, 0, A15, PA7, 0
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
                 TP1, A2,  X0, TP1, TMP0, TMP1, TMP2, TP1, A3,  X1, TP1, TMP0, TMP1, TMP2, \
                 TP2, A4,  X0, TP2, TMP0, TMP1, TMP2, TP2, A5,  X1, TP2, TMP0, TMP1, TMP2, \
                 TP3, A6,  X0, TP3, TMP0, TMP1, TMP2, TP3, A7,  X1, TP3, TMP0, TMP1, TMP2, \
                 TP4, A8,  X0, TP4, TMP0, TMP1, TMP2, TP4, A9,  X1, TP4, TMP0, TMP1, TMP2, \
                 TP5, A10, X0, TP5, TMP0, TMP1, TMP2, TP5, A11, X1, TP5, TMP0, TMP1, TMP2, \
                 TP6, A12, X0, TP6, TMP0, TMP1, TMP2, TP6, A13, X1, TP6, TMP0, TMP1, TMP2, \
                 TP7, A14, X0, TP7, TMP0, TMP1, TMP2, TP7, A15, X1, TP7, TMP0, TMP1, TMP2
.endm

.macro CGEMV_T_LASX XW:req, X8:req
    PTR_SRLI J, N, 3
    beqz     J, .L_\XW\()_N_7
    PTR_SLLI K_LDA, LDA, 3
    PTR_SUB  K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
    ZERO_Y8
    move     X, X_ORG
    PTR_SRLI I, M, 3
    beqz     I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
    CLOAD_\X8
    CGEMV_T_8x8
    PTR_ADDI I, I, -1
    PTR_ALSL X, INC_X, X, 3
    bnez     I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
    // Accumulate the partial sums
    GCOMPLEXACC xvf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
                Y5, TP5, Y6, TP6, Y7, TP7
    andi     I, M, 7
    beqz     I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    fld.d $f1,  X,   0x00
    fld.d $f11, PA0, 0x00
    fld.d $f12, PA1, 0x00
    fld.d $f13, PA2, 0x00
    fld.d $f14, PA3, 0x00
    fld.d $f15, PA4, 0x00
    fld.d $f16, PA5, 0x00
    fld.d $f17, PA6, 0x00
    fld.d $f18, PA7, 0x00
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#elif __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
               PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif

    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2, \
                 A4, A12, X0, A4, TMP0, TMP1, TMP2, A5, A13, X0, A5, TMP0, TMP1, TMP2, \
                 A6, A14, X0, A6, TMP0, TMP1, TMP2, A7, A15, X0, A7, TMP0, TMP1, TMP2

    PTR_ADDI I, I, -1
    PTR_ADD  X, X, INC_X
    bnez     I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    fld.d    $f11, Y, 0x00
    fldx.d   $f12, Y, INC_Y
    PTR_ALSL PY0, INC_Y, Y, 1
    fld.d    $f13, PY0, 0x00
    fldx.d   $f14, PY0, INC_Y
    PTR_ALSL PY1, INC_Y, Y, 2
    fld.d    $f15, PY1, 0x00
    fldx.d   $f16, PY1, INC_Y
    PTR_ALSL PY2, INC_Y, PY1, 1
    fld.d    $f17, PY2, 0x00
    fldx.d   $f18, PY2, INC_Y

    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2, \
                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2, \
                 A12, VALPHA, A4, A12, TMP0, TMP1, TMP2, A13, VALPHA, A5, A13, TMP0, TMP1, TMP2, \
                 A14, VALPHA, A6, A14, TMP0, TMP1, TMP2, A15, VALPHA, A7, A15, TMP0, TMP1, TMP2

    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
              PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
    fst.d    $f11, Y, 0x00
    fstx.d   $f12, Y, INC_Y
    fst.d    $f13, PY0, 0x00
    fstx.d   $f14, PY0, INC_Y
    fst.d    $f15, PY1, 0x00
    fstx.d   $f16, PY1, INC_Y
    fst.d    $f17, PY2, 0x00
    fstx.d   $f18, PY2, INC_Y
    PTR_ALSL Y, INC_Y, Y, 3
    bnez     J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
    andi     J, N, 7
    beqz     J, .L_END
    PTR_SUB  K_LDA, LDA, M8
.L_\XW\()_N_1:
    ZERO_Y1
    move     X, X_ORG
    move     I, M
    beqz     I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    fld.d $f3, PA0, 0x00
    fld.d $f1, X, 0x00
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI I, I, -1
    PTR_ADD  X, X, INC_X
    PTR_ADDI PA0, PA0, 0x08
    bnez     I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    fld.d    $f3, Y, 0x00
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    fst.d    $f3, Y, 0x00
    PTR_ADD  PA0, PA0, K_LDA
    PTR_ADD  Y, Y, INC_Y
    bnez     J, .L_\XW\()_N_1

    b .L_END
.endm

PROLOGUE
    PTR_LD   INC_Y, $sp, 0
    push_if_used 17 + 8, 30
    PTR_ADDI K, $r0, 0x01
    PTR_SUB  I, INC_X, K
    maskeqz  I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA
    xvpackev.w  $xr0, $xr1, $xr0
    xvreplve0.d VALPHA, $xr0
    move X_ORG, X
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
              PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h     K, I, 0
    PTR_ADD  T0, T0, K
    jirl     $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0 - .L_GAP_TABLE
    .hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
    CGEMV_T_LASX GAP_0, X8
.L_GAP_1: /* if (inc_x != 1) */
    CGEMV_T_LASX GAP_1, X8_GAP
.L_END:
    pop_if_used 17 + 8, 30
    jirl $r0, $r1, 0x0
EPILOGUE
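The transposed kernel accumulates per-column dot products in TP0..TP7 and only applies alpha once per column block, at .L_*_M_END. A scalar sketch of the same computation for the default case (illustrative only; the helper name is made up):

/* Hedged scalar model of cgemv_t, plain case: for each column j,
 * s = conj-free dot(A[:,j], x) accumulated like TP0..TP7, then
 * y[j] += alpha * s, matching the final GCOMPLEXMADD with VALPHA. */
static void cgemv_t_ref(long m, long n, float alpha_r, float alpha_i,
                        const float *a, long lda,
                        const float *x, long inc_x,
                        float *y, long inc_y)
{
    for (long j = 0; j < n; j++) {
        float sr = 0.0f, si = 0.0f;
        const float *col = &a[2 * j * lda];
        for (long i = 0; i < m; i++) {
            float ar = col[2 * i], ai = col[2 * i + 1];
            float xr = x[2 * i * inc_x], xi = x[2 * i * inc_x + 1];
            sr += ar * xr - ai * xi;
            si += ar * xi + ai * xr;
        }
        /* y[j] += alpha * s */
        y[2 * j * inc_y]     += alpha_r * sr - alpha_i * si;
        y[2 * j * inc_y + 1] += alpha_r * si + alpha_i * sr;
    }
}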
loongarch64_asm.S
@@ -384,6 +384,246 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
.endif
.endm

//
// GCOMPLEXACC: accumulate complex values across the lanes of a vector register
// pre_op: xvf (LASX) or vf (LSX)
// suf_op: s or d, single- or double-precision complex
// Note: when "pre_op == xvf && suf_op == s", \in is modified.
//
.macro GCOMPLEXACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
    xvpermi.q \out, \in, 0x01
.ifeqs "\suf_op", "s"
    \pre_op\()add.\suf_op \in, \out, \in
    xvpackod.d \out, \in, \in
    \pre_op\()add.\suf_op \out, \out, \in
.else
    \pre_op\()add.\suf_op \out, \out, \in
.endif
.endif

.ifeqs "\pre_op", "vf"
.ifeqs "\suf_op", "s"
    vpackod.d \out, \in, \in
    \pre_op\()add.\suf_op \out, \out, \in
.endif
.endif

.ifnb \more
    GCOMPLEXACC \pre_op, \suf_op, \more
.endif
.endm

//
// GCOMPLEXMUL: Complex multiplication, out = in0 * in1
// xconj: default value 0.
// if !(xconj)
//     out_r = in0_r * in1_r - in0_i * in1_i;
//     out_i = in0_r * in1_i + in0_i * in1_r;
// else
//     out_r = in0_r * in1_r + in0_i * in1_i;
//     out_i = in0_r * in1_i - in0_i * in1_r;
// pre_op: xvf (LASX) or vf (LSX)
// suf_op: s or d, single- or double-precision complex
//
.macro GCOMPLEXMUL xconj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
    xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
    xvpackev.w \tmp0, \in0, \in0
.else
    xvpackev.d \tmp0, \in0, \in0
.endif
.else
    vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
    vpackev.w \tmp0, \in0, \in0
.else
    vpackev.d \tmp0, \in0, \in0
.endif
.endif

    \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0

.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
    xvpackod.w \tmp1, \in0, \tmp1
.else
    xvpackod.w \tmp1, \tmp1, \in0
.endif
    xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
    xvpackod.d \tmp1, \in0, \tmp1
.else
    xvpackod.d \tmp1, \tmp1, \in0
.endif
    xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\xconj", "0"
    vpackod.w \tmp1, \in0, \tmp1
.else
    vpackod.w \tmp1, \tmp1, \in0
.endif
    vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\xconj", "0"
    vpackod.d \tmp1, \in0, \tmp1
.else
    vpackod.d \tmp1, \tmp1, \in0
.endif
    vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif

    \pre_op\()mul.\suf_op \out, \tmp0, \in1
    \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out

.ifnb \more
    GCOMPLEXMUL \xconj, \pre_op, \suf_op, \more
.endif
.endm

//
// GCOMPLEXMADD: Complex multiply-accumulate, out = in0 * in1 + in2
// xconj: default value 0
// conj:  default value 0
// if !(CONJ)
//     if !(XCONJ)
//         out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
//         out_i = in0_r * in1_i + in0_i * in1_r + in2_i;
//     else
//         out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
//         out_i = in0_r * in1_i - in0_i * in1_r + in2_i;
// else
//     if !(XCONJ)
//         out_r = in0_r * in1_r + in0_i * in1_i + in2_r;
//         out_i = in2_i - (in0_r * in1_i - in0_i * in1_r);
//     else
//         out_r = in0_r * in1_r - in0_i * in1_i + in2_r;
//         out_i = in2_i - (in0_r * in1_i + in0_i * in1_r);
// pre_op: xvf (LASX) or vf (LSX)
// suf_op: s or d, single- or double-precision complex
//
.macro GCOMPLEXMADD xconj=0, conj=0, pre_op:req, suf_op:req, out:req, in0:req, in1:req, in2:req, tmp0:req, tmp1:req, tmp2:req, more:vararg
.ifeqs "\pre_op", "xvf"
    xvxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
    xvpackev.w \tmp0, \in0, \in0
.else
    xvpackev.d \tmp0, \in0, \in0
.endif
.else
    vxor.v \tmp1, \tmp1, \tmp1
.ifeqs "\suf_op", "s"
    vpackev.w \tmp0, \in0, \in0
.else
    vpackev.d \tmp0, \in0, \in0
.endif
.endif

    \pre_op\()madd.\suf_op \tmp2, \tmp0, \in1, \in2
.ifeqs "\conj", "1"
    \pre_op\()nmsub.\suf_op \tmp0, \tmp0, \in1, \in2
.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
    xvshuf4i.w \tmp0, \tmp0, 0xb1
    xvpackev.w \out, \tmp0, \tmp2
.else
    xvshuf4i.d \tmp0, \tmp0, 0x0b
    xvpackev.d \out, \tmp0, \tmp2
.endif
.else
.ifeqs "\suf_op", "s"
    vshuf4i.w \tmp0, \tmp0, 0xb1
    vpackev.w \out, \tmp0, \tmp2
.else
    vshuf4i.d \tmp0, \tmp0, 0x0b
    vpackev.d \out, \tmp0, \tmp2
.endif
.endif /* pre_op = xvf */
.else
    \pre_op\()add.\suf_op \out, \tmp2, \tmp1
.endif /* conj = 1 */

    \pre_op\()sub.\suf_op \tmp1, \tmp1, \in0

.ifeqs "\pre_op", "xvf"
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
    xvpackod.w \tmp1, \in0, \tmp1
.else
    xvpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
    xvpackod.w \tmp1, \in0, \in0
.else
    xvpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
    xvshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
    xvpackod.d \tmp1, \in0, \tmp1
.else
    xvpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
    xvpackod.d \tmp1, \in0, \in0
.else
    xvpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
    xvshuf4i.d \tmp2, \in1, 0x0b
.endif
.else
.ifeqs "\suf_op", "s"
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
    vpackod.w \tmp1, \in0, \tmp1
.else
    vpackod.w \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
    vpackod.w \tmp1, \in0, \in0
.else
    vpackod.w \tmp1, \tmp1, \tmp1
.endif
.endif
    vshuf4i.w \tmp2, \in1, 0xb1
.else
.ifeqs "\conj", "0"
.ifeqs "\xconj", "0"
    vpackod.d \tmp1, \in0, \tmp1
.else
    vpackod.d \tmp1, \tmp1, \in0
.endif
.else
.ifeqs "\xconj", "0"
    vpackod.d \tmp1, \in0, \in0
.else
    vpackod.d \tmp1, \tmp1, \tmp1
.endif
.endif
    vshuf4i.d \tmp2, \in1, 0x0b
.endif
.endif

    \pre_op\()madd.\suf_op \out, \tmp1, \tmp2, \out

.ifnb \more
    GCOMPLEXMADD \xconj, \conj, \pre_op, \suf_op, \more
.endif
.endm

//
// Media Related Macros
//
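The four formula branches in the GCOMPLEXMADD comment are easier to audit against the pack/shuffle sequences in scalar form. A direct C transcription of those formulas follows (illustrative only; the function is not part of the patch):

/* Scalar restatement of the GCOMPLEXMADD comment: out = op(in0) * op(in1) + in2
 * under the four xconj/conj combinations listed above. */
static void gcomplexmadd_ref(int xconj, int conj,
                             float r0, float i0,  /* in0 */
                             float r1, float i1,  /* in1 */
                             float r2, float i2,  /* in2 */
                             float *out_r, float *out_i)
{
    if (!conj) {
        if (!xconj) { *out_r = r0*r1 - i0*i1 + r2; *out_i = r0*i1 + i0*r1 + i2; }
        else        { *out_r = r0*r1 + i0*i1 + r2; *out_i = r0*i1 - i0*r1 + i2; }
    } else {
        if (!xconj) { *out_r = r0*r1 + i0*i1 + r2; *out_i = i2 - (r0*i1 - i0*r1); }
        else        { *out_r = r0*r1 - i0*i1 + r2; *out_i = i2 - (r0*i1 + i0*r1); }
    }
}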
zgemv_n_4_lasx.S (new file)
@@ -0,0 +1,343 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST  : OK
*
*
*********************************************************************/

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

#define J       $r12
#define I       $r13
#define K       $r14
#define Y_ORG   $r15
#define OFFSET  $r16
#define K_LDA   $r17
#define M16     $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29

#define VALPHA  $xr1
#define X0      $xr2
#define X1      $xr3
#define X2      $xr4
#define X3      $xr5
#define X4      $xr6
#define X5      $xr7
#define X6      $xr8
#define X7      $xr9
#define Y0      $xr10
#define Y1      $xr11
#define A0      $xr12
#define A1      $xr13
#define A2      $xr14
#define A3      $xr15
#define A4      $xr16
#define A5      $xr17
#define A6      $xr18
#define A7      $xr19
#define A8      $xr20
#define A9      $xr21
#define A10     $xr22
#define A11     $xr23
#define A12     $xr24
#define A13     $xr25
#define A14     $xr26
#define A15     $xr27
#define TMP0    $xr28
#define TMP1    $xr29
#define TMP2    $xr30

#if !defined(CONJ)
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  0
#else
#define GXCONJ 1
#define GCONJ  0
#endif
#else
#if !defined(XCONJ)
#define GXCONJ 0
#define GCONJ  1
#else
#define GXCONJ 1
#define GCONJ  1
#endif
#endif

.macro ZLOAD_X_4
    GLD    xv, , X0, X, 0x00, X1, X, 0x10, X2, X, 0x20, X3, X, 0x30
    GPERMI xv, q, X0, X0, 0, X1, X1, 0, X2, X2, 0, X3, X3, 0
    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2
.endm

.macro ZLOAD_X_4_GAP
    xvld      X0, X, 0
    xvpermi.q X0, X0, 0

    PTR_ADD   T0, X, INC_X
    xvld      X1, T0, 0
    xvpermi.q X1, X1, 0

    PTR_ADD   T0, T0, INC_X
    xvld      X2, T0, 0
    xvpermi.q X2, X2, 0

    PTR_ADD   T0, T0, INC_X
    xvld      X3, T0, 0
    xvpermi.q X3, X3, 0

    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2, \
                X1, X1, VALPHA, TMP0, TMP1, TMP2, \
                X2, X2, VALPHA, TMP0, TMP1, TMP2, \
                X3, X3, VALPHA, TMP0, TMP1, TMP2
.endm

.macro ZLOAD_Y_4
    GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm

.macro ZLOAD_Y_4_GAP
    vld      $vr10, Y, 0
    vldx     $vr13, Y, INC_Y
    PTR_ALSL T0, INC_Y, Y, 1
    vld      $vr11, T0, 0
    vldx     $vr14, T0, INC_Y
    GPERMI   xv, q, Y0, A1, 0x02, Y1, A2, 0x02
.endm

.macro ZGEMV_N_4x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, Y1, X0, A1, Y1, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, Y1, X1, A3, Y1, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, Y1, X2, A5, Y1, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2, Y1, X3, A7, Y1, TMP0, TMP1, TMP2
.endm

.macro ZSTORE_Y_4
    GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm

.macro ZSTORE_Y_4_GAP
    xvstelm.d Y0, Y, 0, 0
    xvstelm.d Y0, Y, 0x08, 1
    PTR_ADD   T0, Y, INC_Y
    xvstelm.d Y0, T0, 0, 2
    xvstelm.d Y0, T0, 0x08, 3
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 0
    xvstelm.d Y1, T0, 0x08, 1
    PTR_ADD   T0, T0, INC_Y
    xvstelm.d Y1, T0, 0, 2
    xvstelm.d Y1, T0, 0x08, 3
.endm

.macro ZLOAD_Y_1
    vld $vr10, Y, 0
.endm

.macro ZGEMV_N_1x4
    GLD_INC v, , 0x10, $vr12, PA0, 0, $vr14, PA1, 0, $vr16, PA2, 0, $vr18, PA3, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2, \
                 Y0, X1, A2, Y0, TMP0, TMP1, TMP2, \
                 Y0, X2, A4, Y0, TMP0, TMP1, TMP2, \
                 Y0, X3, A6, Y0, TMP0, TMP1, TMP2
.endm

.macro ZSTORE_Y_1
    vst $vr10, Y, 0
.endm

.macro ZLOAD_X_1
    GLD    xv, , X0, X, 0x00
    GPERMI xv, q, X0, X0, 0
    GCOMPLEXMUL GXCONJ, \
                xvf, d, X0, X0, VALPHA, TMP0, TMP1, TMP2
.endm

.macro ZGEMV_N_1x1
    GLD_INC v, , 0x10, $vr12, PA0, 0
    GCOMPLEXMADD GXCONJ, GCONJ, \
                 xvf, d, Y0, X0, A0, Y0, TMP0, TMP1, TMP2
.endm

.macro ZGEMV_N_LASX XW:req, X_4:req, X_1:req, Y_4:req, Y_1:req
    PTR_SRLI J, N, 2
    beqz     J, .L_\XW\()_N_3
    PTR_SLLI K_LDA, LDA, 2
    PTR_SUB  K_LDA, K_LDA, M16
.L_\XW\()_N_L4:
    ZLOAD_\X_4
    xor      K, K, K
    move     Y, Y_ORG
    PTR_SRLI I, M, 2
    beqz     I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    ZLOAD_\Y_4
    ZGEMV_N_4x4
    ZSTORE_\Y_4
    PTR_ADDI I, I, -1
    PTR_ALSL Y, INC_Y, Y, 2
    PTR_ADDI K, K, 4
    bnez     I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    andi     I, M, 3
    beqz     I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x4
    ZSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    PTR_ALSL X, INC_X, X, 2
    bnez     J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi     J, N, 3
    beqz     J, .L_END
.L_\XW\()_N_L1:
    ZLOAD_\X_1
    xor      K, K, K
    move     Y, Y_ORG
    move     I, M
    beqz     I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    ZLOAD_\Y_1
    ZGEMV_N_1x1
    ZSTORE_\Y_1
    PTR_ADDI I, I, -1
    PTR_ADD  Y, Y, INC_Y
    PTR_ADDI K, K, 1
    bnez     I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    PTR_SUB  K_LDA, LDA, M16
    PTR_ADD  PA0, PA0, K_LDA
    PTR_ADD  X, X, INC_X
    bnez     J, .L_\XW\()_N_L1

    b .L_END
.endm

PROLOGUE
    PTR_LD   INC_Y, $sp, 0
    push_if_used 17 + 7, 31
    PTR_ADDI K, $r0, 0x01
    PTR_SUB  I, INC_X, K
    PTR_SUB  J, INC_Y, K
    maskeqz  I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
    maskeqz  J, K, J /* if (inc_y == 1) J = 0; else J = 1; */
    PTR_ALSL I, I, J, 1
    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
    // Init VALPHA
    xvpackev.d  $xr0, $xr1, $xr0
    xvreplve0.q VALPHA, $xr0
    move Y_ORG, Y
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA
#endif
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h     K, I, 0 // Obtain the offset address
    PTR_ADD  T0, T0, K
    jirl     $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0_0 - .L_GAP_TABLE
    .hword .L_GAP_0_1 - .L_GAP_TABLE
    .hword .L_GAP_1_0 - .L_GAP_TABLE
    .hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* if (inc_x == 1) && (inc_y == 1) */
    ZGEMV_N_LASX GAP_0_0, X_4, X_1, Y_4, Y_1
.L_GAP_0_1: /* if (inc_x == 1) && (inc_y != 1) */
    ZGEMV_N_LASX GAP_0_1, X_4, X_1, Y_4_GAP, Y_1
.L_GAP_1_0: /* if (inc_x != 1) && (inc_y == 1) */
    ZGEMV_N_LASX GAP_1_0, X_4_GAP, X_1, Y_4, Y_1
.L_GAP_1_1: /* if (inc_x != 1) && (inc_y != 1) */
    ZGEMV_N_LASX GAP_1_1, X_4_GAP, X_1, Y_4_GAP, Y_1
.L_END:
    pop_if_used 17 + 7, 31
    jirl $r0, $r1, 0x0
EPILOGUE
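Like cgemv_n, the prologue above dispatches through .L_GAP_TABLE on whether each increment equals one: maskeqz turns each test into 0 or 1, PTR_ALSL combines them into a table index, and the .hword entries carry the offset of the chosen specialization. A C sketch of that index computation (the enum and function are illustrative, not part of the source):

/* Hedged model of the 4-way GAP dispatch in the *gemv_n prologues. */
enum gap_variant { GAP_0_0, GAP_0_1, GAP_1_0, GAP_1_1 };

static enum gap_variant pick_gap_variant(long inc_x, long inc_y)
{
    int i = (inc_x != 1);                  /* maskeqz I, K, I */
    int j = (inc_y != 1);                  /* maskeqz J, K, J */
    return (enum gap_variant)(2 * i + j);  /* PTR_ALSL I, I, J, 1 */
}

The transposed kernels use the same mechanism with a two-entry table, since only inc_x changes the load path there.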
zgemv_t_4_lasx.S (new file)
@@ -0,0 +1,299 @@
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER

#include "common.h"
#include "loongarch64_asm.S"

/*********************************************************************
* 2024/02/20 guxiwei
* UTEST : OK
* CTEST : OK
* TEST  : OK
*
*
*********************************************************************/

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 * FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

#define J       $r12
#define I       $r13
#define K       $r14
#define PY0     $r14
#define X_ORG   $r15
#define PY1     $r16
#define K_LDA   $r17
#define PY2     $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29
#define M16     $r30

#define VALPHA  $xr0
#define X0      $xr1
#define X1      $xr2
#define A0      $xr3
#define A1      $xr4
#define A2      $xr5
#define A3      $xr6
#define A4      $xr7
#define A5      $xr8
#define A6      $xr9
#define A7      $xr10
#define A8      $xr11
#define A9      $xr12
#define A10     $xr13
#define A11     $xr14
#define A12     $xr15
#define A13     $xr16
#define A14     $xr17
#define A15     $xr18
#define TP0     $xr19
#define TP1     $xr20
#define TP2     $xr21
#define TP3     $xr22
#define TP4     $xr23
#define TP5     $xr24
#define TP6     $xr25
#define TP7     $xr26
#define TMP0    $xr27
#define TMP1    $xr28
#define TMP2    $xr29
#define Y0      $xr3
#define Y1      $xr4
#define Y2      $xr5
#define Y3      $xr6
#define Y4      $xr7
#define Y5      $xr8
#define Y6      $xr9
#define Y7      $xr10

#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
#define GXCONJ1 0
#define GCONJ1  0
#else
#define GXCONJ1 1
#define GCONJ1  0
#endif

#if !defined(XCONJ)
#define GXCONJ2 0
#define GCONJ2  0
#else
#define GXCONJ2 0
#define GCONJ2  1
#endif

.macro ZERO_Y4
    GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm

.macro ZERO_Y1
    GXOR xv, v, TP0, TP0, TP0
.endm

.macro ZLOAD_X4
    GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm

.macro ZLOAD_X4_GAP
    xvld      X0, X, 0

    PTR_ADD   T0, X, INC_X
    xvld      A0, T0, 0
    xvpermi.q X0, A0, 0x02

    PTR_ADD   T0, T0, INC_X
    xvld      X1, T0, 0

    PTR_ADD   T0, T0, INC_X
    xvld      A0, T0, 0
    xvpermi.q X1, A0, 0x02
.endm

.macro ZGEMV_T_4x4
    GLD_INC xv, , 0x20, \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
                 TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
                 TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
                 TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm

.macro ZGEMV_T_LASX XW:req, X4:req
    PTR_SRLI J, N, 2
    beqz     J, .L_\XW\()_N_3
    PTR_SLLI K_LDA, LDA, 2
    PTR_SUB  K_LDA, K_LDA, M16
.L_\XW\()_N_L4:
    ZERO_Y4
    move     X, X_ORG
    PTR_SRLI I, M, 2
    beqz     I, .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    ZLOAD_\X4
    ZGEMV_T_4x4
    PTR_ADDI I, I, -1
    PTR_ALSL X, INC_X, X, 2
    bnez     I, .L_\XW\()_M_L4
.L_\XW\()_M_3:
    // Accumulate the partial sums
    GCOMPLEXACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    andi     I, M, 3
    beqz     I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    GLD xv, , X0, X, 0x00, A8, PA0, 0x00, A9, PA1, 0x00, A10, PA2, 0x00, A11, PA3, 0x00
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#elif __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#else
    GADDI , d, PA0, PA0, 0x10, PA1, PA1, 0x10, PA2, PA2, 0x10, PA3, PA3, 0x10
#endif

    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, d, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2

    PTR_ADDI I, I, -1
    PTR_ADD  X, X, INC_X
    bnez     I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
    xvld     A8, Y, 0x00
    xvldx    A9, Y, INC_Y
    PTR_ALSL PY0, INC_Y, Y, 1
    xvld     A10, PY0, 0x00
    xvldx    A11, PY0, INC_Y

    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, d, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2, \
                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2

    PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    vst      $vr11, Y, 0x00
    vstx     $vr12, Y, INC_Y
    vst      $vr13, PY0, 0x00
    vstx     $vr14, PY0, INC_Y
    PTR_ALSL Y, INC_Y, Y, 2
    bnez     J, .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi     J, N, 3
    beqz     J, .L_END
    PTR_SUB  K_LDA, LDA, M16
.L_\XW\()_N_1:
    ZERO_Y1
    move     X, X_ORG
    move     I, M
    beqz     I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
    GLD xv, , A0, PA0, 0x00, X0, X, 0x00
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 xvf, d, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI I, I, -1
    PTR_ADD  X, X, INC_X
    PTR_ADDI PA0, PA0, 0x10
    bnez     I, .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI J, J, -1
    xvld     A0, Y, 0x00
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 xvf, d, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    vst      $vr3, Y, 0x00
    PTR_ADD  PA0, PA0, K_LDA
    PTR_ADD  Y, Y, INC_Y
    bnez     J, .L_\XW\()_N_1

    b .L_END
.endm

PROLOGUE
    PTR_LD   INC_Y, $sp, 0
    push_if_used 17 + 8, 30
    PTR_ADDI K, $r0, 0x01
    PTR_SUB  I, INC_X, K
    maskeqz  I, K, I /* if (inc_x == 1) I = 0; else I = 1; */
    GSLLI , d, LDA, LDA, 4, INC_X, INC_X, 4, INC_Y, INC_Y, 4, M16, M, 4
    // Init VALPHA
    xvpackev.d  $xr0, $xr1, $xr0
    xvreplve0.q VALPHA, $xr0
    move X_ORG, X
    move PA0, A
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
    la.local T0, .L_GAP_TABLE
    PTR_ALSL I, I, T0, 1
    ld.h     K, I, 0
    PTR_ADD  T0, T0, K
    jirl     $r0, T0, 0
.L_GAP_TABLE:
    .hword .L_GAP_0 - .L_GAP_TABLE
    .hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (inc_x == 1) */
    ZGEMV_T_LASX GAP_0, X4
.L_GAP_1: /* if (inc_x != 1) */
    ZGEMV_T_LASX GAP_1, X4_GAP
.L_END:
    pop_if_used 17 + 8, 30
    jirl $r0, $r1, 0x0
EPILOGUE
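One step worth calling out in both transposed kernels is the reduction at .L_*_M_3: each 256-bit accumulator TPk holds independent partial sums, one per 128-bit lane, and GCOMPLEXACC folds them before alpha is applied. A hedged model of the double-precision (xvf, d) case, viewing the register as an array of two lanes purely for illustration:

#include <complex.h>

/* Hedged model of GCOMPLEXACC xvf, d: the two double-complex partial
 * sums in the 256-bit register's lanes are folded into one value
 * (xvpermi.q copies the high lane, xvfadd.d adds it to the low lane). */
static double complex zgemv_t_acc_ref(const double complex lane[2])
{
    return lane[0] + lane[1];
}

For the single-precision variant the macro additionally folds the two complex floats left in the low 128 bits (the xvpackod.d step), which is why its comment warns that \in is modified.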