LoongArch64: Add dgemv_t_8_lasx.S and dgemv_n_8_lasx.S

gxw 2023-07-11 10:01:12 +08:00
parent 8a171350db
commit ec1e96aac8
5 changed files with 1334 additions and 0 deletions


@@ -8,6 +8,9 @@ DGEMMITCOPYOBJ = dgemm_itcopy$(TSUFFIX).$(SUFFIX)
DGEMMONCOPYOBJ = dgemm_oncopy$(TSUFFIX).$(SUFFIX)
DGEMMOTCOPYOBJ = dgemm_otcopy$(TSUFFIX).$(SUFFIX)
DGEMVNKERNEL = dgemv_n_8_lasx.S
DGEMVTKERNEL = dgemv_t_8_lasx.S
DTRSMKERNEL_LN = ../generic/trsm_kernel_LN.c
DTRSMKERNEL_LT = ../generic/trsm_kernel_LT.c
DTRSMKERNEL_RN = ../generic/trsm_kernel_RN.c


@@ -132,12 +132,16 @@ CSWAPKERNEL = ../arm/zswap.c
ZSWAPKERNEL = ../arm/zswap.c
SGEMVNKERNEL = ../arm/gemv_n.c
ifndef DGEMVNKERNEL
DGEMVNKERNEL = ../arm/gemv_n.c
endif
CGEMVNKERNEL = ../arm/zgemv_n.c
ZGEMVNKERNEL = ../arm/zgemv_n.c
SGEMVTKERNEL = ../arm/gemv_t.c
ifndef DGEMVTKERNEL
DGEMVTKERNEL = ../arm/gemv_t.c
endif
CGEMVTKERNEL = ../arm/zgemv_t.c
ZGEMVTKERNEL = ../arm/zgemv_t.c


@@ -0,0 +1,546 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/14 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
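/* Rough scalar reference of what this kernel computes (orientation only; the loop
 * variables below are illustrative, not part of the interface):
 *
 *   for (j = 0; j < n; j++)
 *     for (i = 0; i < m; i++)
 *       y[i*inc_y] += alpha * a[i + j*lda] * x[j*inc_x];
 *
 * A is column-major with leading dimension lda; dummy1 and buffer are not referenced.
 */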
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define Y_ORG $r15
#define OFFSET $r16
#define K_LDA $r17
#define M8 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define VALPHA $xr1
#define X0 $xr2
#define X1 $xr3
#define X2 $xr4
#define X3 $xr5
#define X4 $xr6
#define X5 $xr7
#define X6 $xr8
#define X7 $xr9
#define Y0 $xr10
#define Y1 $xr11
#define A0 $xr12
#define A1 $xr13
#define A2 $xr14
#define A3 $xr15
#define A4 $xr16
#define A5 $xr17
#define A6 $xr18
#define A7 $xr19
#define A8 $xr20
#define A9 $xr21
#define A10 $xr22
#define A11 $xr23
#define A12 $xr24
#define A13 $xr25
#define A14 $xr26
#define A15 $xr27
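/* Helper-macro overview:
 *   DLOAD_X_{8,4,2,1}     broadcast x elements into LASX registers, pre-scaled by alpha
 *                         (the *_GAP forms handle inc_x != 1)
 *   DLOAD_Y_* / DSTORE_Y_* load/store a block of y (the *_GAP forms handle inc_y != 1)
 *   DGEMV_N_RxC           accumulate an R-row by C-column block of A into y
 */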
.macro DLOAD_X_8
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18, \
X4, X, 0x20, X5, X, 0x28, X6, X, 0x30, X7, X, 0x38
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
.macro DLOAD_X_4
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08, X2, X, 0x10, X3, X, 0x18
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
.macro DLOAD_X_2
GLDREPL xv, d, X0, X, 0x00, X1, X, 0x08
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
.macro DLOAD_X_1
GLDREPL xv, d, X0, X, 0x00
GMUL xvf, d, X0, X0, VALPHA
.endm
.macro DLOAD_Y_8
GLD xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro DLOAD_Y_4
GLD xv, , Y0, Y, 0
.endm
.macro DLOAD_Y_1
fld.d $f10, Y, 0
.endm
.macro DSTORE_Y_8
GST xv, , Y0, Y, 0, Y1, Y, 0x20
.endm
.macro DSTORE_Y_4
GST xv, , Y0, Y, 0
.endm
.macro DSTORE_Y_1
fst.d $f10, Y, 0
.endm
// Non-unit INC_Y: contiguous vector load/store instructions cannot be used,
// so y elements are moved one at a time.
.macro DLOAD_Y_8_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
PTR_ALSL T0, INC_Y, Y, 2
fld.d $f11, T0, 0
fldx.d $f17, T0, INC_Y
PTR_ADD T0, T0, INC_Y
PTR_ADD T0, T0, INC_Y
fld.d $f18, T0, 0
fldx.d $f19, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3, Y1, A5, 1, Y1, A6, 2, Y1, A7, 3
.endm
.macro DLOAD_Y_4_GAP
fld.d $f10, Y, 0
fldx.d $f13, Y, INC_Y
PTR_ALSL T0, INC_Y, Y, 1
fld.d $f14, T0, 0
fldx.d $f15, T0, INC_Y
GINSVE0 xv, d, Y0, A1, 1, Y0, A2, 2, Y0, A3, 3
.endm
.macro DSTORE_Y_8_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 0
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y1, T0, 0, 3
.endm
.macro DSTORE_Y_4_GAP
xvstelm.d Y0, Y, 0, 0
PTR_ADD T0, Y, INC_Y
xvstelm.d Y0, T0, 0, 1
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 2
PTR_ADD T0, T0, INC_Y
xvstelm.d Y0, T0, 0, 3
.endm
.macro DLOAD_X_8_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X4, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X5, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X6, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X7, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA, \
X4, X4, VALPHA, X5, X5, VALPHA, X6, X6, VALPHA, X7, X7, VALPHA
.endm
.macro DLOAD_X_4_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X2, T0, 0x00
PTR_ADD T0, T0, INC_X
xvldrepl.d X3, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA, X2, X2, VALPHA, X3, X3, VALPHA
.endm
.macro DLOAD_X_2_GAP
xvldrepl.d X0, X, 0x00
PTR_ADD T0, X, INC_X
xvldrepl.d X1, T0, 0x00
GMUL xvf, d, X0, X0, VALPHA, X1, X1, VALPHA
.endm
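// Core 8x8 block: each PAi walks one of the 8 current columns; two LASX loads fetch
// 8 rows of that column, which are multiplied by the broadcast, alpha-scaled x element
// Xi and accumulated into the 8 y values held in Y0/Y1.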
.macro DGEMV_N_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1, \
Y0, A8, X4, Y0, Y1, A9, X4, Y1, \
Y0, A10, X5, Y0, Y1, A11, X5, Y1, \
Y0, A12, X6, Y0, Y1, A13, X6, Y1, \
Y0, A14, X7, Y0, Y1, A15, X7, Y1
.endm
.macro DGEMV_N_4x8
GLD_INC xv, , 0x20, A0, PA0, 0, \
A2, PA1, 0, \
A4, PA2, 0, \
A6, PA3, 0, \
A8, PA4, 0, \
A10, PA5, 0, \
A12, PA6, 0, \
A14, PA7, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, \
Y0, A6, X3, Y0, \
Y0, A8, X4, Y0, \
Y0, A10, X5, Y0, \
Y0, A12, X6, Y0, \
Y0, A14, X7, Y0
.endm
.macro DGEMV_N_1x8
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0, \
$f20, PA4, 0, $f22, PA5, 0, $f24, PA6, 0, $f26, PA7, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, \
$f10, $f18, $f5, $f10, \
$f10, $f20, $f6, $f10, \
$f10, $f22, $f7, $f10, \
$f10, $f24, $f8, $f10, \
$f10, $f26, $f9, $f10
.endm
.macro DGEMV_N_8x4
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1, \
Y0, A4, X2, Y0, Y1, A5, X2, Y1, \
Y0, A6, X3, Y0, Y1, A7, X3, Y1
.endm
.macro DGEMV_N_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y0, A2, X1, Y0, \
Y0, A4, X2, Y0, Y0, A6, X3, Y0
.endm
.macro DGEMV_N_1x4
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0, $f16, PA2, 0, $f18, PA3, 0
GMADD f, d, $f10, $f12, $f2, $f10, $f10, $f14, $f3, $f10, \
$f10, $f16, $f4, $f10, $f10, $f18, $f5, $f10
.endm
.macro DGEMV_N_8x2
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, Y1, A1, X0, Y1, \
Y0, A2, X1, Y0, Y1, A3, X1, Y1
.endm
.macro DGEMV_N_4x2
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, Y0, A0, X0, Y0, \
Y0, A2, X1, Y0
.endm
.macro DGEMV_N_1x2
GLD_INC f, d, 0x08, $f12, PA0, 0, $f14, PA1, 0
GMADD f, d, $f10, $f12, $f2, $f10, \
$f10, $f14, $f3, $f10
.endm
.macro DGEMV_N_1x1
fld.d $f12, PA0, 0
PTR_ADDI PA0, PA0, 0x08
fmadd.d $f10, $f12, $f2, $f10
.endm
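// Driver macro: the column index (N) is blocked 8/4/2/1 and, inside each column block,
// the row index (M) is blocked 8/4/1.  When a column block completes, adding K_LDA
// (block width * lda minus the M-loop advance, in bytes) moves every PAi pointer to its
// column in the next block, and X is advanced past the consumed x elements.  \XW only
// makes the expanded label names unique for the four inc_x/inc_y dispatch variants.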
.macro DGEMV_N XW:req, X_8:req, X_4:req, X_2:req, X_1:req, Y_8:req, Y_4:req, Y_1:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
DLOAD_\X_8
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
DLOAD_\Y_8
DGEMV_N_8x8
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ALSL Y, INC_Y, Y, 3
PTR_ADDI K, K, 8
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\Y_4
DGEMV_N_4x8
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_M_3:
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
DLOAD_\Y_1
DGEMV_N_1x8
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
PTR_ALSL X, INC_X, X, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
DLOAD_\X_4
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\Y_8
DGEMV_N_8x4
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\Y_4
DGEMV_N_4x4
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_4_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
DLOAD_\Y_1
DGEMV_N_1x4
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
DLOAD_\X_2
xor K, K, K
move Y, Y_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\Y_8
DGEMV_N_8x2
DSTORE_\Y_8
PTR_ADDI I, I, -1
PTR_ADDI K, K, 8
PTR_ALSL Y, INC_Y, Y, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\Y_4
DGEMV_N_4x2
DSTORE_\Y_4
PTR_ALSL Y, INC_Y, Y, 2
PTR_ADDI K, K, 4
.L_\XW\()_N_2_M_3:
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
DLOAD_\Y_1
DGEMV_N_1x2
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
PTR_ADD PA0, PA0, K_LDA
PTR_ADD PA1, PA1, K_LDA
PTR_ALSL X, INC_X, X, 1
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
DLOAD_\X_1
xor K, K, K
move Y, Y_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
DLOAD_\Y_1
DGEMV_N_1x1
DSTORE_\Y_1
PTR_ADDI I, I, -1
PTR_ADD Y, Y, INC_Y
PTR_ADDI K, K, 1
bnez I, .L_\XW\()_N_1_M_L1
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 7, 24 + 4
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
PTR_SUB J, INC_Y, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
maskeqz J, K, J /* if(inc_y == 1) J = 0; else J = 1; */
PTR_ALSL I, I, J, 1
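/* I = 2*(inc_x != 1) + (inc_y != 1): index of the matching variant in .L_GAP_TABLE */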
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
xvreplve0.d VALPHA, $xr0
move Y_ORG, Y
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0_0 - .L_GAP_TABLE
.hword .L_GAP_0_1 - .L_GAP_TABLE
.hword .L_GAP_1_0 - .L_GAP_TABLE
.hword .L_GAP_1_1 - .L_GAP_TABLE
.L_GAP_0_0: /* inc_x == 1 && inc_y == 1 */
DGEMV_N GAP_0_0, X_8, X_4, X_2, X_1, Y_8, Y_4, Y_1
.L_GAP_0_1: /* inc_x == 1 && inc_y != 1 */
DGEMV_N GAP_0_1, X_8, X_4, X_2, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_GAP_1_0: /* inc_x != 1 && inc_y == 1 */
DGEMV_N GAP_1_0, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8, Y_4, Y_1
.L_GAP_1_1: /* inc_x != 1 && inc_y != 1 */
DGEMV_N GAP_1_1, X_8_GAP, X_4_GAP, X_2_GAP, X_1, Y_8_GAP, Y_4_GAP, Y_1
.L_END:
pop_if_used 17 + 7, 24 + 4
jirl $r0, $r1, 0x0
EPILOGUE


@@ -0,0 +1,468 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/17 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha,
* FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
*/
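/* Rough scalar reference of the transposed kernel (orientation only; loop variables
 * are illustrative):
 *
 *   for (j = 0; j < n; j++) {
 *     temp = 0.0;
 *     for (i = 0; i < m; i++)
 *       temp += a[i + j*lda] * x[i*inc_x];
 *     y[j*inc_y] += alpha * temp;
 *   }
 */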
#define M $r4
#define N $r5
#define ALPHA $f0
#define A $r7
#define LDA $r8
#define X $r9
#define INC_X $r10
#define Y $r11
#define INC_Y $r6
#define J $r12
#define I $r13
#define K $r14
#define PY0 $r14
#define X_ORG $r15
#define PY1 $r16
#define K_LDA $r17
#define PY2 $r18
#define T0 $r19
#define PA0 $r20
#define PA1 $r23
#define PA2 $r24
#define PA3 $r25
#define PA4 $r26
#define PA5 $r27
#define PA6 $r28
#define PA7 $r29
#define M8 $r30
#define VALPHA $xr0
#define X0 $xr1
#define X1 $xr2
#define A0 $xr3
#define A1 $xr4
#define A2 $xr5
#define A3 $xr6
#define A4 $xr7
#define A5 $xr8
#define A6 $xr9
#define A7 $xr10
#define A8 $xr11
#define A9 $xr12
#define A10 $xr13
#define A11 $xr14
#define A12 $xr15
#define A13 $xr16
#define A14 $xr17
#define A15 $xr18
#define TP0 $xr19
#define TP1 $xr20
#define TP2 $xr21
#define TP3 $xr22
#define TP4 $xr23
#define TP5 $xr24
#define TP6 $xr25
#define TP7 $xr26
#define Y0 $xr3
#define Y1 $xr4
#define Y2 $xr5
#define Y3 $xr6
#define Y4 $xr7
#define Y5 $xr8
#define Y6 $xr9
#define Y7 $xr10
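// Note: Y0-Y7 alias A0-A7 ($xr3-$xr10); they are only referenced after GACC has
// reduced the accumulators, when the A registers are no longer live.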
.macro ZERO_Y8
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3, \
TP4, TP4, TP4, TP5, TP5, TP5, TP6, TP6, TP6, TP7, TP7, TP7
.endm
.macro ZERO_Y4
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm
.macro ZERO_Y2
GXOR xv, v, TP0, TP0, TP0, TP1, TP1, TP1
.endm
.macro ZERO_Y1
GXOR xv, v, TP0, TP0, TP0
.endm
.macro DLOAD_X8
GLD xv, , X0, X, 0x00, X1, X, 0x20
.endm
.macro DLOAD_X4
GLD xv, , X0, X, 0x00
.endm
.macro DLOAD_X8_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
PTR_ALSL T0, INC_X, X, 2
fld.d $f2, T0, 0x00
fldx.d $f3, T0, INC_X
PTR_ALSL T0, INC_X, T0, 1
fld.d $f4, T0, 0x00
fldx.d $f5, T0, INC_X
GINSVE0 xv, d, X1, A0, 1, X1, A1, 2, X1, A2, 3
.endm
.macro DLOAD_X4_GAP
fld.d $f1, X, 0x00
fldx.d $f2, X, INC_X
PTR_ALSL T0, INC_X, X, 1
fld.d $f3, T0, 0x00
fldx.d $f4, T0, INC_X
GINSVE0 xv, d, X0, X1, 1, X0, A0, 2, X0, A1, 3
.endm
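// Core 8x8 block: for each of the 8 columns pointed to by PA0..PA7, load 8 rows,
// multiply element-wise by the 8 x values in X0/X1, and accumulate the partial dot
// product of column i into TPi.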
.macro DGEMV_T_8x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0, \
A8, PA4, 0, A9, PA4, 0, \
A10, PA5, 0, A11, PA5, 0, \
A12, PA6, 0, A13, PA6, 0, \
A14, PA7, 0, A15, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3, \
TP4, A8, X0, TP4, TP4, A9, X1, TP4, \
TP5, A10, X0, TP5, TP5, A11, X1, TP5, \
TP6, A12, X0, TP6, TP6, A13, X1, TP6, \
TP7, A14, X0, TP7, TP7, A15, X1, TP7
.endm
.macro DGEMV_T_8x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0, \
A8, PA4, 0, A10, PA5, 0, A12, PA6, 0, A14, PA7, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3, \
TP4, A8, X0, TP4, TP5, A10, X0, TP5, \
TP6, A12, X0, TP6, TP7, A14, X0, TP7
.endm
.macro DGEMV_T_4x8
GLD_INC xv, , 0x20, \
A0, PA0, 0, A1, PA0, 0, \
A2, PA1, 0, A3, PA1, 0, \
A4, PA2, 0, A5, PA2, 0, \
A6, PA3, 0, A7, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1, \
TP2, A4, X0, TP2, TP2, A5, X1, TP2, \
TP3, A6, X0, TP3, TP3, A7, X1, TP3
.endm
.macro DGEMV_T_4x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0, A4, PA2, 0, A6, PA3, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1, \
TP2, A4, X0, TP2, TP3, A6, X0, TP3
.endm
.macro DGEMV_T_2x8
GLD_INC xv, , 0x20, A0, PA0, 0, A1, PA0, 0, A2, PA1, 0, A3, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP0, A1, X1, TP0, \
TP1, A2, X0, TP1, TP1, A3, X1, TP1
.endm
.macro DGEMV_T_2x4
GLD_INC xv, , 0x20, A0, PA0, 0, A2, PA1, 0
GMADD xvf, d, TP0, A0, X0, TP0, TP1, A2, X0, TP1
.endm
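// Driver macro: columns (N) are blocked 8/4/2/1; for each column block the row loop
// accumulates partial dot products in TP0..TPn, GACC reduces each TPi to a scalar, and
// y[j] += alpha * dot(A(:,j), x) is applied with scalar fmadd/fst.  \XW only makes the
// expanded label names unique for the unit-/non-unit-stride x variants.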
.macro DGEMV_T XW:req, X8:req, X4:req
PTR_SRLI J, N, 3
beqz J, .L_\XW\()_N_7
PTR_SLLI K_LDA, LDA, 3
PTR_SUB K_LDA, K_LDA, M8
.L_\XW\()_N_L8:
ZERO_Y8
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_M_7
.align 5
.L_\XW\()_M_L8:
DLOAD_\X8
DGEMV_T_8x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_M_L8
.L_\XW\()_M_7:
andi I, M, 4
beqz I, .L_\XW\()_M_3
DLOAD_\X4
DGEMV_T_8x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_M_3:
// Horizontally accumulate the partial sums in TP0..TP7
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3, Y4, TP4, \
Y5, TP5, Y6, TP6, Y7, TP7
andi I, M, 3
beqz I, .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
fld.d $f1, X, 0x00
fld.d $f11, PA0, 0x00
fld.d $f12, PA1, 0x00
fld.d $f13, PA2, 0x00
fld.d $f14, PA3, 0x00
fld.d $f15, PA4, 0x00
fld.d $f16, PA5, 0x00
fld.d $f17, PA6, 0x00
fld.d $f18, PA7, 0x00
#if __loongarch_grlen == 64
GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#else
GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08, \
PA4, PA4, 0x08, PA5, PA5, 0x08, PA6, PA6, 0x08, PA7, PA7, 0x08
#endif
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6, \
$f7, $f15, $f1, $f7, $f8, $f16, $f1, $f8, $f9, $f17, $f1, $f9, $f10, $f18, $f1, $f10
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_M_L1
.L_\XW\()_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
PTR_ALSL PY1, INC_Y, Y, 2
fld.d $f15, PY1, 0x00
fldx.d $f16, PY1, INC_Y
PTR_ALSL PY2, INC_Y, PY1, 1
fld.d $f17, PY2, 0x00
fldx.d $f18, PY2, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14, \
$f15, ALPHA, $f7, $f15, $f16, ALPHA, $f8, $f16, $f17, ALPHA, $f9, $f17, $f18, ALPHA, $f10, $f18
PTR_ADDI J, J, -1
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA, \
PA4, PA4, K_LDA, PA5, PA5, K_LDA, PA6, PA6, K_LDA, PA7, PA7, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
fst.d $f15, PY1, 0x00
fstx.d $f16, PY1, INC_Y
fst.d $f17, PY2, 0x00
fstx.d $f18, PY2, INC_Y
PTR_ALSL Y, INC_Y, Y, 3
bnez J, .L_\XW\()_N_L8
.L_\XW\()_N_7:
andi J, N, 4
beqz J, .L_\XW\()_N_3
ZERO_Y4
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_4_M_7
.align 5
.L_\XW\()_N_4_M_L8:
DLOAD_\X8
DGEMV_T_4x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_4_M_L8
.L_\XW\()_N_4_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_4_M_3
DLOAD_\X4
DGEMV_T_4x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_4_M_3:
// Horizontally accumulate the partial sums in TP0..TP3
GACC xvf, d, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
andi I, M, 3
beqz I, .L_\XW\()_N_4_M_END
.align 5
.L_\XW\()_N_4_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00, $f13, PA2, 0x00, $f14, PA3, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4, $f5, $f13, $f1, $f5, $f6, $f14, $f1, $f6
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_4_M_L1
.L_\XW\()_N_4_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
PTR_ALSL PY0, INC_Y, Y, 1
fld.d $f13, PY0, 0x00
fldx.d $f14, PY0, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12, $f13, ALPHA, $f5, $f13, $f14, ALPHA, $f6, $f14
PTR_SLLI K_LDA, LDA, 2
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
fst.d $f13, PY0, 0x00
fstx.d $f14, PY0, INC_Y
PTR_ALSL Y, INC_Y, Y, 2
.L_\XW\()_N_3:
andi J, N, 2
beqz J, .L_\XW\()_N_1
ZERO_Y2
move X, X_ORG
PTR_SRLI I, M, 3
beqz I, .L_\XW\()_N_2_M_7
.align 5
.L_\XW\()_N_2_M_L8:
DLOAD_\X8
DGEMV_T_2x8
PTR_ADDI I, I, -1
PTR_ALSL X, INC_X, X, 3
bnez I, .L_\XW\()_N_2_M_L8
.L_\XW\()_N_2_M_7:
andi I, M, 4
beqz I, .L_\XW\()_N_2_M_3
DLOAD_\X4
DGEMV_T_2x4
PTR_ALSL X, INC_X, X, 2
.L_\XW\()_N_2_M_3:
// Horizontally accumulate the partial sums in TP0..TP1
GACC xvf, d, Y0, TP0, Y1, TP1
andi I, M, 3
beqz I, .L_\XW\()_N_2_M_END
.align 5
.L_\XW\()_N_2_M_L1:
fld.d $f1, X, 0x00
GLD_INC f, d, 0x08, $f11, PA0, 0x00, $f12, PA1, 0x00
GMADD f, d, $f3, $f11, $f1, $f3, $f4, $f12, $f1, $f4
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
bnez I, .L_\XW\()_N_2_M_L1
.L_\XW\()_N_2_M_END:
fld.d $f11, Y, 0x00
fldx.d $f12, Y, INC_Y
GMADD f, d, $f11, ALPHA, $f3, $f11, $f12, ALPHA, $f4, $f12
PTR_SLLI K_LDA, LDA, 1
PTR_SUB K_LDA, K_LDA, M8
#if __loongarch_grlen == 64
GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#else
GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA
#endif
fst.d $f11, Y, 0x00
fstx.d $f12, Y, INC_Y
PTR_ALSL Y, INC_Y, Y, 1
.L_\XW\()_N_1:
andi J, N, 1
beqz J, .L_END
ZERO_Y1
move X, X_ORG
move I, M
beqz I, .L_END
.align 5
.L_\XW\()_N_1_M_L1:
fld.d $f3, PA0, 0x00
fld.d $f1, X, 0x00
fmadd.d $f19, $f3, $f1, $f19
PTR_ADDI I, I, -1
PTR_ADD X, X, INC_X
PTR_ADDI PA0, PA0, 0x08
bnez I, .L_\XW\()_N_1_M_L1
fld.d $f3, Y, 0x00
fmadd.d $f3, ALPHA, $f19, $f3
fst.d $f3, Y, 0x00
b .L_END
.endm
PROLOGUE
PTR_LD INC_Y, $sp, 0
push_if_used 17 + 8, 24 + 3
PTR_ADDI K, $r0, 0x01
PTR_SUB I, INC_X, K
maskeqz I, K, I /* if(inc_x == 1) I = 0; else I = 1; */
GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
xvreplve0.d VALPHA, $xr0
move X_ORG, X
move PA0, A
#if __loongarch_grlen == 64
GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#else
GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA, \
PA5, PA4, LDA, PA6, PA5, LDA, PA7, PA6, LDA
#endif
la.local T0, .L_GAP_TABLE
PTR_ALSL I, I, T0, 1
ld.h K, I, 0
PTR_ADD T0, T0, K
jirl $r0, T0, 0
.L_GAP_TABLE:
.hword .L_GAP_0 - .L_GAP_TABLE
.hword .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* inc_x == 1 */
DGEMV_T GAP_0, X8, X4
.L_GAP_1: /* inc_x != 1 */
DGEMV_T GAP_1, X8_GAP, X4_GAP
.L_END:
pop_if_used 17 + 8, 24 + 3
jirl $r0, $r1, 0x0
EPILOGUE


@@ -0,0 +1,313 @@
/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#if __loongarch_grlen == 64
#define LA_REG int64_t
#define REG_SIZE 8
#define REG_LOG 3
#define PTR_ADDI addi.d
#define PTR_ADD add.d
#define PTR_SUB sub.d
#define PTR_LD ld.d
#define PTR_ST st.d
#define PTR_SLLI slli.d
#define PTR_SRLI srli.d
#define PTR_ALSL alsl.d
#else
#define LA_REG int32_t
#define REG_SIZE 4
#define REG_LOG 2
#define PTR_ADDI addi.w
#define PTR_ADD add.w
#define PTR_SUB sub.w
#define PTR_LD ld.w
#define PTR_ST st.w
#define PTR_SLLI slli.w
#define PTR_SRLI srli.w
#define PTR_ALSL alsl.w
#endif
#if __loongarch_frlen == 64
#define FREG_SIZE 8
#define FREG_LOG 3
#define PTR_FLD fld.d
#define PTR_FST fst.d
#else
#define FREG_SIZE 4
#define FREG_LOG 2
#define PTR_FLD fld.s
#define PTR_FST fst.s
#endif
// The maximum number of integer/floating-point registers that are caller-saved,
// i.e. that do not need to be preserved across calls.
// Ref: https://loongson.github.io/LoongArch-Documentation/LoongArch-ELF-ABI-CN.html
#define MAX_INT_CALLER_SAVED 17
#define MAX_FP_CALLER_SAVED 24
.altmacro // Enable alternate macro mode
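// push_if_used regs, fregs: "regs"/"fregs" give the total number of integer/FP registers
// a routine uses; only the excess over the caller-saved maxima (i.e. $s0..., $fs0...) is
// spilled to the stack, and pop_if_used restores it symmetrically.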
.macro push_if_used regs, fregs
.if \regs > MAX_INT_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\regs - MAX_INT_CALLER_SAVED) << REG_LOG)
push_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
.endif
.if \fregs > MAX_FP_CALLER_SAVED
PTR_ADDI $sp, $sp, -((\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG)
push_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
.endif
.endm // End push_if_used
.macro pop_if_used regs, fregs
.if \fregs > MAX_FP_CALLER_SAVED
pop_fregs 0, \fregs - MAX_FP_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\fregs - MAX_FP_CALLER_SAVED) << FREG_LOG
.endif
.if \regs > MAX_INT_CALLER_SAVED
pop_regs 0, \regs - MAX_INT_CALLER_SAVED - 1
PTR_ADDI $sp, $sp, (\regs - MAX_INT_CALLER_SAVED) << REG_LOG
.endif
.endm // End pop_if_used
.macro push_regs from, to
PTR_ST $s\()\from, $sp, \from << REG_LOG
.if \to - \from
push_regs %from + 1, \to
.endif
.endm // End push_regs
.macro pop_regs from, to
PTR_LD $s\()\from, $sp, \from << REG_LOG
.if \to - \from
pop_regs %from + 1, \to
.endif
.endm // End pop_regs
.macro push_fregs from, to
PTR_FST $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
push_fregs %from + 1, \to
.endif
.endm // End push_fregs
.macro pop_fregs from, to
PTR_FLD $fs\()\from, $sp, \from << FREG_LOG
.if \to - \from
pop_fregs %from + 1, \to
.endif
.endm // End pop_fregs
//
// Instruction Related Macros
//
// GLD
//
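// e.g. "GLD xv, , $xr0, $r4, 0x00, $xr1, $r4, 0x20" (registers illustrative) expands to
//      "xvld $xr0, $r4, 0x00" followed by "xvld $xr1, $r4, 0x20"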
.macro GLD pre_op:req, suf_op=0, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ld \out, \src, \offset
.else
\pre_op\()ld.\suf_op \out, \src, \offset
.endif
.ifnb \more
GLD \pre_op, \suf_op, \more
.endif
.endm
//
// GLD_INC
//
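// e.g. "GLD_INC xv, , 0x20, $xr0, $r20, 0, $xr1, $r20, 0" (registers illustrative) expands to
//      "xvld $xr0, $r20, 0" / "PTR_ADDI $r20, $r20, 0x20" / "xvld $xr1, $r20, 0" / "PTR_ADDI $r20, $r20, 0x20"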
.macro GLD_INC pre_op:req, suf_op=0, inc:req, out:req, src:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ld \out, \src, \offset
.else
\pre_op\()ld.\suf_op \out, \src, \offset
.endif
PTR_ADDI \src, \src, \inc
.ifnb \more
GLD_INC \pre_op, \suf_op, \inc, \more
.endif
.endm
//
// GLDX is the same as GLD, except that the offset is a register instead of an immediate
//
.macro GLDX pre_op:req, suf_op=0, out:req, src:req, offset:req/* reg */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()ldx \out, \src, \offset
.else
\pre_op\()ldx.\suf_op \out, \src, \offset
.endif
.ifnb \more
GLDX \pre_op, \suf_op, \more
.endif
.endm
//
// GLDREPL
//
.macro GLDREPL pre_op:req, suf_op:req, out:req, src:req, offset:req/* imm */, more:vararg
\pre_op\()ldrepl.\suf_op \out, \src, \offset
.ifnb \more
GLDREPL \pre_op, \suf_op, \more
.endif
.endm
//
// GST
//
.macro GST pre_op:req, suf_op=0, src:req, dst:req, offset:req/* imm */, more:vararg
.ifeqs "\suf_op", "0"
\pre_op\()st \src, \dst, \offset
.else
\pre_op\()st.\suf_op \src, \dst, \offset
.endif
.ifnb \more
GST \pre_op, \suf_op, \more
.endif
.endm
//
// GMUL
//
.macro GMUL pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()mul.\suf_op \out, \in0, \in1
.ifnb \more
GMUL \pre_op, \suf_op, \more
.endif
.endm
//
// GMADD
//
.macro GMADD pre_op, suf_op:req, out:req, in0:req, in1:req, in2:req, more:vararg
\pre_op\()madd.\suf_op \out, \in0, \in1, \in2
.ifnb \more
GMADD \pre_op, \suf_op, \more
.endif
.endm
//
// GADD
//
.macro GADD pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()add.\suf_op \out, \in0, \in1
.ifnb \more
GADD \pre_op, \suf_op, \more
.endif
.endm
//
// GADDI
//
.macro GADDI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()addi.\suf_op \out, \in0, \in1
.ifnb \more
GADDI \pre_op, \suf_op, \more
.endif
.endm
//
// GSLLI
//
.macro GSLLI pre_op, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()slli.\suf_op \out, \in0, \in1
.ifnb \more
GSLLI \pre_op, \suf_op, \more
.endif
.endm
//
// GINSVE0
//
.macro GINSVE0 pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()insve0.\suf_op \out, \in0, \in1
.ifnb \more
GINSVE0 \pre_op, \suf_op, \more
.endif
.endm
//
// GXOR
//
.macro GXOR pre_op:req, suf_op:req, out:req, in0:req, in1:req, more:vararg
\pre_op\()xor.\suf_op \out, \in0, \in1
.ifnb \more
GXOR \pre_op, \suf_op, \more
.endif
.endm
//
// Compound instructions
//
// GACC: Accumulate the values of vector registers
//
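// e.g. "GACC xvf, d, $xr0, $xr1" (registers illustrative) sums the four doubles held in
// $xr1 and leaves the result in the low element of $xr0 (readable as $f0); \in is clobbered.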
.macro GACC pre_op:req, suf_op:req, out:req, in:req, more:vararg
.ifeqs "\pre_op", "xvf"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "vf"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifeqs "\suf_op", "s"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.ifeqs "\pre_op", "xv"
xvpermi.q \out, \in, 0x01
\pre_op\()add.\suf_op \in, \out, \in
xvpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
xvpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
xvpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
xvpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifeqs "\pre_op", "v"
vpackod.d \out, \in, \in
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "d"
vpackod.w \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "w"
vpackod.h \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.ifnc "\suf_op", "h"
vpackod.b \in, \out, \out
\pre_op\()add.\suf_op \out, \out, \in
.endif
.endif
.endif
.endif
.ifnb \more
GACC \pre_op, \suf_op, \more
.endif
.endm