/* Listing metadata: 291 lines, 8.8 KiB; syntax-highlighted as "ArmAsm". */
/*******************************************************************************
Copyright (c) 2024, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/

/* ASSEMBLER is defined ahead of common.h; presumably it makes that header
 * expose only its assembler-side definitions (TODO confirm in common.h). */
#define ASSEMBLER

#include "common.h"
/* Project macro library. The G-prefixed vector helpers, the PTR_-prefixed
 * pointer-width helpers and push_if_used/pop_if_used used further down are
 * presumably provided here; their definitions are not visible in this file. */
#include "loongarch64_asm.S"

/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i,
 *           FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer)
 *
 * General-purpose register map.
 *
 * Incoming arguments, in prototype order. INC_Y does not arrive in a register:
 * the entry code loads it from 0($sp) into $r6. After the entry code has run,
 * LDA, INC_X and INC_Y hold their original values shifted left by 3, i.e.
 * byte distances for 8-byte elements.
 */
#define M       $r4
#define N       $r5
#define ALPHA_R $f0
#define ALPHA_I $f1
#define A       $r7
#define LDA     $r8
#define X       $r9
#define INC_X   $r10
#define Y       $r11
#define INC_Y   $r6

/* Working registers.
 *   J      column-loop counter
 *   I      row-loop counter; also the dispatch index in the entry code
 *   K      scratch in the entry code only; shares $r14 with PY0
 *   PY0    address of the third y element of the current 4-column group
 *   X_ORG  saved start of x, reloaded into X for every column pass
 *   K_LDA  byte offset added to the column pointers after each pass
 *   T0     scratch address register
 *   PA0..PA3  current read positions in up to four adjacent columns of a
 *   PA4    computed by the entry code but not read by the code in this file
 *   M8     M shifted left by 3 (set up in the entry code)
 * PY1, PY2 and PA5..PA7 are not referenced by the code in this file.
 */
#define J       $r12
#define I       $r13
#define K       $r14
#define PY0     $r14
#define X_ORG   $r15
#define PY1     $r16
#define K_LDA   $r17
#define PY2     $r18
#define T0      $r19
#define PA0     $r20
#define PA1     $r23
#define PA2     $r24
#define PA3     $r25
#define PA4     $r26
#define PA5     $r27
#define PA6     $r28
#define PA7     $r29
#define M8      $r30

/* LSX vector register map.
 *
 * The code below also addresses these registers through their scalar names:
 * $fN is the low 64 bits of $vrN, so for example "fld.d $f1" fills the low
 * half of X0 and "fld.d $f11" fills the low half of A8.
 *
 *   VALPHA    alpha, packed by the entry code from $vr0/$vr1
 *   X0, X1    elements of x
 *   A0..A15   elements of a; A8..A11 double as the y values in the
 *             four-column write-back
 *   TP0..TP7  per-column accumulators (only TP0..TP3 are used in this file)
 *   TMP0..2   scratch operands handed to GCOMPLEXMADD
 *   Y0..Y7    ALIASES of A0..A7 ($vr3..$vr10), used for the reduced
 *             per-column sums; writing a Yn destroys the matching An
 * A12..A15, TP4..TP7 and Y4..Y7 are not referenced by the code in this file.
 */
#define VALPHA  $vr0
#define X0      $vr1
#define X1      $vr2
#define A0      $vr3
#define A1      $vr4
#define A2      $vr5
#define A3      $vr6
#define A4      $vr7
#define A5      $vr8
#define A6      $vr9
#define A7      $vr10
#define A8      $vr11
#define A9      $vr12
#define A10     $vr13
#define A11     $vr14
#define A12     $vr15
#define A13     $vr16
#define A14     $vr17
#define A15     $vr18
#define TP0     $vr19
#define TP1     $vr20
#define TP2     $vr21
#define TP3     $vr22
#define TP4     $vr23
#define TP5     $vr24
#define TP6     $vr25
#define TP7     $vr26
#define TMP0    $vr27
#define TMP1    $vr28
#define TMP2    $vr29
#define Y0      $vr3
#define Y1      $vr4
#define Y2      $vr5
#define Y3      $vr6
#define Y4      $vr7
#define Y5      $vr8
#define Y6      $vr9
#define Y7      $vr10

/* Conjugation flags passed as the first two arguments of GCOMPLEXMADD.
 *
 * Pair 1 (GXCONJ1, GCONJ1) is used for the a * x products:
 *   GXCONJ1 = 1 when exactly one of CONJ / XCONJ is defined, otherwise 0
 *   GCONJ1  = 0 in every configuration
 */
#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
    #define GXCONJ1 0
    #define GCONJ1  0
#else
    #define GXCONJ1 1
    #define GCONJ1  0
#endif

/* Pair 2 (GXCONJ2, GCONJ2) is used for the final alpha * sum + y step:
 *   GXCONJ2 = 0 in every configuration
 *   GCONJ2  = 1 when XCONJ is defined, otherwise 0
 * How GCOMPLEXMADD interprets each flag is defined where that macro lives
 * (presumably loongarch64_asm.S); it cannot be determined from this file.
 */
#if !defined(XCONJ)
    #define GXCONJ2 0
    #define GCONJ2  0
#else
    #define GXCONJ2 0
    #define GCONJ2  1
#endif

/* ZERO_Y4: reset the four column accumulators TP0..TP3.
 * Each register is passed to GXOR as destination and as both sources, so the
 * result is all-zero provided GXOR emits a plain vector XOR per triple
 * (presumably vxor.v; GXOR itself is defined outside this file).
 */
.macro ZERO_Y4
    GXOR v, v, TP0, TP0, TP0, TP1, TP1, TP1, TP2, TP2, TP2, TP3, TP3, TP3
.endm

/* ZERO_Y1: reset the single accumulator TP0 used by the one-column path.
 * Same self-XOR pattern as ZERO_Y4, applied to TP0 only.
 */
.macro ZERO_Y1
    GXOR v, v, TP0, TP0, TP0
.endm

/* CLOAD_X4: fetch x for the contiguous case.
 * Fills X0 from offset 0x00 and X1 from offset 0x10 relative to X, i.e. 32
 * consecutive bytes (four 8-byte elements), assuming GLD emits one full
 * vector load per (register, base, offset) triple -- GLD is defined outside
 * this file. X itself is not modified.
 */
.macro CLOAD_X4
    GLD v, , X0, X, 0x00, X1, X, 0x10
.endm

/* CLOAD_X4_GAP: fetch four x elements that are INC_X bytes apart.
 *
 * Result:   X0 = { x[0], x[1] },  X1 = { x[2], x[3] }   (low, high 64 bits)
 * Clobbers: T0, and the low halves of A0 and A1 ($f3, $f4), which serve as
 *           staging registers. X itself is not modified.
 */
.macro CLOAD_X4_GAP
    fld.d       $f1,    X,      0x00        /* low half of X0 = x[0] */
    fldx.d      $f3,    X,      INC_X       /* low half of A0 = x[1] */
    PTR_ALSL    T0,     INC_X,  X,      1   /* T0 = X + 2 * INC_X */
    fld.d       $f2,    T0,     0x00        /* low half of X1 = x[2] */
    fldx.d      $f4,    T0,     INC_X       /* low half of A1 = x[3] */
    vpackev.d   X0,     A0,     X0          /* X0 = { X0.lo, A0.lo } */
    vpackev.d   X1,     A1,     X1          /* X1 = { X1.lo, A1.lo } */
.endm

/* CGEMV_T_4x4: one step of the blocked inner loop (4 rows x 4 columns).
 *
 * Inputs:  X0, X1 hold four x elements; PA0..PA3 point into four columns.
 * Effect:  loads two vectors from each column (A0/A1 from PA0, A2/A3 from
 *          PA1, A4/A5 from PA2, A6/A7 from PA3) and accumulates
 *          TPk += column_k * x for k = 0..3.
 * The 0x10 argument of GLD_INC is presumably a post-increment applied to the
 * base register after every load, which would leave each PAk advanced by 32
 * bytes -- GLD_INC is defined outside this file, confirm there.
 * GCOMPLEXMADD takes groups of seven operands; from the usage here they look
 * like (dst, a, b, addend, tmp0, tmp1, tmp2) -- also to be confirmed.
 * Clobbers: A0..A7 (and therefore Y0..Y7), TMP0..TMP2.
 */
.macro CGEMV_T_4x4
    GLD_INC v, , 0x10,  \
            A0, PA0, 0, A1, PA0, 0, \
            A2, PA1, 0, A3, PA1, 0, \
            A4, PA2, 0, A5, PA2, 0, \
            A6, PA3, 0, A7, PA3, 0
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2, TP0, A1, X1, TP0, TMP0, TMP1, TMP2, \
                 TP1, A2, X0, TP1, TMP0, TMP1, TMP2, TP1, A3, X1, TP1, TMP0, TMP1, TMP2, \
                 TP2, A4, X0, TP2, TMP0, TMP1, TMP2, TP2, A5, X1, TP2, TMP0, TMP1, TMP2, \
                 TP3, A6, X0, TP3, TMP0, TMP1, TMP2, TP3, A7, X1, TP3, TMP0, TMP1, TMP2
.endm

/* CGEMV_T_LSX XW, X4 -- body of the kernel, expanded once per x-stride case.
 *
 * Parameters
 *   XW  text spliced into every local label so that several expansions can
 *       coexist in one object file.
 *   X4  suffix appended to "CLOAD_" to pick the x loader for the blocked
 *       loop (X4 -> CLOAD_X4, X4_GAP -> CLOAD_X4_GAP).
 *
 * Expected on entry (all established by the kernel entry code):
 *   LDA, INC_X, INC_Y and M8 already scaled by 8; X_ORG = start of x;
 *   PA0..PA3 = first four columns of a; VALPHA = packed alpha.
 *
 * Structure
 *   1. N / 4 passes over groups of four columns. Each pass walks the rows
 *      four at a time (CLOAD + CGEMV_T_4x4), folds the accumulators with
 *      GCOMPLEXACC, handles the M % 4 leftover rows one at a time, then
 *      combines the four sums with alpha and y and stores them back to y.
 *   2. N % 4 passes over single columns, one row at a time.
 *   Every path leaves through the shared .L_END label, which is defined
 *   outside the macro.
 */
.macro CGEMV_T_LSX XW:req, X4:req
    PTR_SRLI    J,      N,      2           /* J = number of 4-column groups */
    beqz        J,      .L_\XW\()_N_3
    PTR_SLLI    K_LDA,  LDA,    2
    PTR_SUB     K_LDA,  K_LDA,  M8          /* K_LDA = 4 * LDA - M8: moves a pointer that has
                                               walked M rows to the same row 4 columns later */
.L_\XW\()_N_L4:
    ZERO_Y4
    move        X,      X_ORG               /* restart x for this column group */
    PTR_SRLI    I,      M,      2           /* I = number of 4-row steps */
    beqz        I,      .L_\XW\()_M_3
.align 5
.L_\XW\()_M_L4:
    CLOAD_\X4
    CGEMV_T_4x4
    PTR_ADDI    I,      I,      -1
    PTR_ALSL    X,      INC_X,  X,      2   /* X += 4 * INC_X */
    bnez        I,      .L_\XW\()_M_L4
.L_\XW\()_M_3:
    // Accumulated
    /* Fold each TPk into Yk (= A0..A3); presumably a sum across the lanes of
       TPk -- GCOMPLEXACC is defined outside this file. */
    GCOMPLEXACC vf, s, Y0, TP0, Y1, TP1, Y2, TP2, Y3, TP3
    andi        I,      M,      3           /* leftover rows */
    beqz        I,      .L_\XW\()_M_END
.align 5
.L_\XW\()_M_L1:
    /* One row of four columns: x into the low half of X0, the four matrix
       elements into the low halves of A8..A11 ($f11..$f14). */
    fld.d       $f1,    X,      0x00
    fld.d       $f11,   PA0,    0x00
    fld.d       $f12,   PA1,    0x00
    fld.d       $f13,   PA2,    0x00
    fld.d       $f14,   PA3,    0x00
    /* Advance every column pointer by one 8-byte element. */
#if __loongarch_grlen == 64
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#elif __loongarch_grlen == 32
    GADDI , w, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#else
    GADDI , d, PA0, PA0, 0x08, PA1, PA1, 0x08, PA2, PA2, 0x08, PA3, PA3, 0x08
#endif

    /* Accumulate straight into the folded sums A0..A3 (= Y0..Y3). */
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, A0, A8, X0, A0, TMP0, TMP1, TMP2, A1, A9, X0, A1, TMP0, TMP1, TMP2, \
                 A2, A10, X0, A2, TMP0, TMP1, TMP2, A3, A11, X0, A3, TMP0, TMP1, TMP2

    PTR_ADDI    I,      I,      -1
    PTR_ADD     X,      X,      INC_X
    bnez        I,      .L_\XW\()_M_L1
.L_\XW\()_M_END:
    /* Read the four y elements of this group into the low halves of
       A8..A11; PY0 addresses the third one. */
    fld.d       $f11,   Y,      0x00
    fldx.d      $f12,   Y,      INC_Y
    PTR_ALSL    PY0,    INC_Y,  Y,      1   /* PY0 = Y + 2 * INC_Y */
    fld.d       $f13,   PY0,    0x00
    fldx.d      $f14,   PY0,    INC_Y

    /* y_k = VALPHA * sum_k + y_k, using the second conjugation flag pair. */
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, s, A8, VALPHA, A0, A8, TMP0, TMP1, TMP2, A9, VALPHA, A1, A9, TMP0, TMP1, TMP2,\
                 A10, VALPHA, A2, A10, TMP0, TMP1, TMP2, A11, VALPHA, A3, A11, TMP0, TMP1, TMP2

    PTR_ADDI    J,      J,      -1
    /* Move all four column pointers to the next group of columns. */
#if __loongarch_grlen == 64
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#elif __loongarch_grlen == 32
    GADD , w, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#else
    GADD , d, PA0, PA0, K_LDA, PA1, PA1, K_LDA, PA2, PA2, K_LDA, PA3, PA3, K_LDA
#endif
    fst.d       $f11,   Y,      0x00
    fstx.d      $f12,   Y,      INC_Y
    fst.d       $f13,   PY0,    0x00
    fstx.d      $f14,   PY0,    INC_Y
    PTR_ALSL    Y,      INC_Y,  Y,      2   /* Y += 4 * INC_Y */
    bnez        J,      .L_\XW\()_N_L4
.L_\XW\()_N_3:
    andi        J,      N,      3           /* leftover columns */
    beqz        J,      .L_END
    PTR_SUB     K_LDA,  LDA,    M8          /* K_LDA = LDA - M8: single-column step */
.L_\XW\()_N_1:
    ZERO_Y1
    move        X,      X_ORG
    move        I,      M
    beqz        I,      .L_END              /* no rows: leave without touching y */
.align 5
.L_\XW\()_N_1_M_L1:
    fld.d       $f3,    PA0,    0x00        /* low half of A0 = matrix element */
    fld.d       $f1,    X,      0x00        /* low half of X0 = x element */
    GCOMPLEXMADD GXCONJ1, GCONJ1, \
                 vf, s, TP0, A0, X0, TP0, TMP0, TMP1, TMP2
    PTR_ADDI    I,      I,      -1
    PTR_ADD     X,      X,      INC_X
    PTR_ADDI    PA0,    PA0,    0x08
    bnez        I,      .L_\XW\()_N_1_M_L1
.L_\XW\()_N_1_M_END:
    PTR_ADDI    J,      J,      -1
    fld.d       $f3,    Y,      0x00        /* low half of A0 = current y element */
    GCOMPLEXMADD GXCONJ2, GCONJ2, \
                 vf, s, A0, VALPHA, TP0, A0, TMP0, TMP1, TMP2
    fst.d       $f3,    Y,      0x00
    PTR_ADD     PA0,    PA0,    K_LDA       /* next column */
    PTR_ADD     Y,      Y,      INC_Y
    bnez        J,      .L_\XW\()_N_1

    b           .L_END                      /* never fall through into whatever follows the expansion */
.endm

/* Kernel entry point.
 *
 * Sets up the registers that CGEMV_T_LSX expects, then jumps through a
 * two-entry offset table to the expansion matching the x stride:
 *   inc_x == 1 -> .L_GAP_0 (contiguous x loads)
 *   inc_x != 1 -> .L_GAP_1 (element-by-element x loads)
 * Both expansions finish by branching to .L_END.
 */
    PROLOGUE

    /* inc_y is fetched from 0($sp); this happens before push_if_used, which
       presumably moves $sp while saving registers (defined outside this file). */
    PTR_LD      INC_Y,  $sp,    0
    push_if_used 8, 6
    PTR_ADDI    K,      $r0,    0x01
    PTR_SUB     I,      INC_X,  K
    maskeqz     I,      K,      I  /* if(inc_x == 1) I = 0; else I = 1; */
    /* Convert element counts to byte distances (8-byte elements): LDA, INC_X
       and INC_Y are scaled in place and M8 = M * 8.
       NOTE(review): the width argument is hard-coded to d here, while the
       GADD/GADDI uses elsewhere in this file switch on __loongarch_grlen --
       confirm whether that is intentional. */
    GSLLI , d, LDA, LDA, 3, INC_X, INC_X, 3, INC_Y, INC_Y, 3, M8, M, 3
    // Init VALPHA
    /* First pack: word 0 = $vr0 word 0 (alpha_r), word 1 = $vr1 word 0
       (alpha_i). Second pack: copy that low 64-bit pair into both halves. */
    vpackev.w   $vr0,   $vr1,   $vr0
    vpackev.d   VALPHA, $vr0,   $vr0
    move        X_ORG,  X
    move        PA0,    A
    /* PA1..PA4 = the following columns, each LDA bytes after the previous. */
#if __loongarch_grlen == 64
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#elif __loongarch_grlen == 32
    GADD , w, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#else
    GADD , d, PA1, PA0, LDA, PA2, PA1, LDA, PA3, PA2, LDA, PA4, PA3, LDA
#endif
    /* Dispatch: read the 16-bit offset at table index I (0 or 1) and jump to
       table base + offset. */
    la.local    T0,     .L_GAP_TABLE
    PTR_ALSL    I,      I,      T0,     1   /* I = &table[I], 2-byte entries */
    ld.h        K,      I,      0
    PTR_ADD     T0,     T0,     K
    jirl        $r0,    T0,     0
.L_GAP_TABLE:
    .hword  .L_GAP_0 - .L_GAP_TABLE
    .hword  .L_GAP_1 - .L_GAP_TABLE
.L_GAP_0: /* if (incx == 1) */
    CGEMV_T_LSX GAP_0, X4
.L_GAP_1: /* if (incx != 1) */
    CGEMV_T_LSX GAP_1, X4_GAP
.L_END:
    pop_if_used 8, 6
    jirl        $r0,    $r1,    0x0         /* return to the address in $r1 */
    EPILOGUE