OpenBLAS/kernel/loongarch64/dtrsm_kernel_LN_16x4_lasx.S

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
#define ASSEMBLER
#include "common.h"
#include "loongarch64_asm.S"
/*********************************************************************
* 2023/07/26 guxiwei
* UTEST : OK
* CTEST : OK
* TEST : OK
*
*
*********************************************************************/
/* int CNAME(BLASLONG m, BLASLONG n, BLASLONG k, FLOAT dummy1, FLOAT *a, FLOAT *b,
* FLOAT *c, BLASLONG ldc, BLASLONG offset)
*/
#define M $r4 // param 1: bm
#define N $r5 // param 2: bn
#define K $r6 // param 3: bk
#define A $r7 // param 5: ba
#define B $r8 // param 6: bb
#define C $r9 // param 7: bc
#define LDC $r10 // param 8: ldc
#define OFFSET $r11 // param 9: offset
/* Loop control parameters */
#define I $r13
#define J $r14
#define L $r15
#define TL $r16
/* Matrix address */
#define A0 $r17
#define B0 $r18
#define C0 $r19
#define C1 $r20
#define C2 $r23
#define C3 $r24
#define T0 $r25
#define T1 $r26
#define T2 $r27
#define KK $r28
#define AA $r29
#define CC $r30
#undef ZERO
#define ZERO $r0
#define U0 $xr0
#define U1 $xr1
#define U2 $xr2
#define U3 $xr3
#define U4 $xr4
#define U5 $xr5
#define U6 $xr6
#define U7 $xr7
#define U8 $xr8
#define U9 $xr9
#define U10 $xr10
#define U11 $xr11
#define U12 $xr12
#define U13 $xr13
#define U14 $xr14
#define U15 $xr15
#define D0 $xr16
#define D1 $xr17
#define D2 $xr18
#define D3 $xr19
#define D4 $xr20
#define D5 $xr21
#define D6 $xr22
#define D7 $xr23
#define D8 $xr24
#define D9 $xr25
#define D10 $xr26
#define D11 $xr27
#define D12 $xr28
#define D13 $xr29
#define D14 $xr30
#define D15 $xr31
/* Prefetch interval */
#define A_PRE 0x400
#define B_PRE 0x100
#include "dtrsm_kernel_macro.S"
// Fusing the dgemm update with the dsolve (triangular solve) step brings several advantages:
// 1. It avoids function-call overhead (dgemm_kernel is not invoked)
// 2. It reduces the stores and reloads of C data
// 3. It allows dsolve to be vectorized
// DGEMM_UNROLL_M x DGEMM_UNROLL_N is 16x4, which is a fairly large tile.
// To allow finer-grained optimization, 15 cases are handled separately:
// 16x4, 16x2, 16x1, 8x4, 8x2, 8x1, 4x4, 4x2, 4x1, 2x4, 2x2, 2x1, 1x4, 1x2, 1x1.
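//
// As a rough scalar reference (not part of the build): each fused tile below is
// assumed to behave like the generic C trsm_kernel_LN solve, with the packed A
// block stored row-major and its diagonal entries pre-inverted by the packing
// routine, so the per-row division becomes a multiplication:
//
//   for (int i = m - 1; i >= 0; i--) {               // solve the last row first
//       for (int j = 0; j < n; j++) {
//           double x = c[i + j * ldc] * a[i * m + i]; // a(i,i) assumed to hold 1/a(i,i)
//           b[i * n + j]   = x;                       // solved row goes back to packed B
//           c[i + j * ldc] = x;                       // and to C
//           for (int k = 0; k < i; k++)               // eliminate x from the rows above
//               c[k + j * ldc] -= x * a[i * m + k];
//       }
//   }
//
// The macros below vectorize this with LASX: up to four columns of C are kept in
// registers, rows are produced four at a time, and each a(i,k) scalar is
// broadcast to all lanes with GLDREPL.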
.macro dsolve_16 N
// if N = 4 the data layout of C is as follows:
// U0 U1 U2 U3
// U4 U5 U6 U7
// U8 U9 U10 U11
// U12 U13 U14 U15
// if N = 2 the data layout of C is as follows:
// U0 U1 U2 U3
// U4 U5 U6 U7
// if N = 1 the data layout of C is as follows:
// U0 U1 U2 U3
// The matrix A has dimensions of 16x16, and
// it will be divided into 4 segments for processing.
#define G12 U3
#define G13 U7
#define G14 U11
#define G15 U15
GTRANSPOSE4x4_D U3, U7, U11, U15, G12, G13, G14, G15, D0, D1
// A
// G12 G13 G14 G15
// -----------------
// 204 | D9
// 220 221 | D8 D7
// 236 237 238 | D6 D5 D4
// 252 253 254 255 | D3 D2 D1 D0
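// The numbers above are double-word offsets into the packed 16x16 A block
// (row-major, offset = row * 16 + col); the diagonal entries (204, 221, 238,
// 255) are assumed to be stored pre-inverted, so the xvfmul.d below stands in
// for the division.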
PTR_ADDI T0, A0, 252 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 236 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 220 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 204 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G15, G15, D0
GNMSUB xvf, d, G14, G15, D1, G14
xvfmul.d G14, G14, D4
GNMSUB xvf, d, G13, G15, D2, G13, G13, G14, D5, G13
xvfmul.d G13, G13, D7
GNMSUB xvf, d, G12, G15, D3, G12, G12, G14, D6, G12, G12, G13, D8, G12
xvfmul.d G12, G12, D9
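// Scalar view of the four rows just solved (a(i,i) assumed to hold its reciprocal):
// x15 = c15 * a(15,15)
// x14 = (c14 - a(15,14)*x15) * a(14,14)
// x13 = (c13 - a(15,13)*x15 - a(14,13)*x14) * a(13,13)
// x12 = (c12 - a(15,12)*x15 - a(14,12)*x14 - a(13,12)*x13) * a(12,12)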
// Store B
.if \N == 4
// x x x x ... x x x x
// x x x x ... x x x x
// x x x x ... x x x x
// b48 b49 b50 b51 ... b60 b61 b62 b63
GST xv, , G12, B0, 48 * 8, G13, B0, 52 * 8, G14, B0, 56 * 8, G15, B0, 60 * 8
.elseif \N == 2
// x x x x ... x x x x
// x x x x ... x x x x
// x x x x ... x x x x
// b24 b25 b26 b27 b28 b29 b30 b31
GST v, , $vr3, B0, 24 * 8, $vr7, B0, 26 * 8, $vr11, B0, 28 * 8, $vr15, B0, 30 * 8
.elseif \N == 1
// x x x x
// x x x x
// x x x x
// b12 b13 b14 b15
GST f, d, $f3, B0, 12 * 8, $f7, B0, 13 * 8, $f11, B0, 14 * 8, $f15, B0, 15 * 8
.endif
// Transpose G15 G14 G13 G12
GTRANSPOSE4x4_D G12, G13, G14, G15, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
// x x x x ... c12 c13 c14 c15
// x x x x ... c28 c29 c30 c31
// x x x x ... c44 c45 c46 c47
// x x x x ... c60 c61 c62 c63
GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8, D2, C2, 12 * 8, D3, C3, 12 * 8
.elseif \N == 2
// x x x x ... c12 c13 c14 c15
// x x x x ... c28 c29 c30 c31
GST xv, , D0, C0, 12 * 8, D1, C1, 12 * 8
.elseif \N == 1
// x x x x ... c12 c13 c14 c15
GST xv, , D0, C0, 12 * 8
.endif
#define G8 U2
#define G9 U6
#define G10 U10
#define G11 U14
GTRANSPOSE4x4_D U2, U6, U10, U14, G8, G9, G10, G11, D0, D1
// A
// G8 G9 G10 G11
// -----------------
// 136 | D9
// 152 153 | D8 D7
// 168 169 170 | D6 D5 D4
// 184 185 186 187 | D3 D2 D1 D0
// 200 201 202 203 | D15 D14 D13 D12
// 216 217 218 219 | D11 D10 D9 D8
// 232 233 234 235 | D7 D6 D5 D4
// 248 249 250 251 | D3 D2 D1 D0
PTR_ADDI T0, A0, 248 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 232 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 216 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 200 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G11, G15, D0, G11, G10, G15, D1, G10, G9, G15, D2, G9, G8, G15, D3, G8, \
G11, G14, D4, G11, G10, G14, D5, G10, G9, G14, D6, G9, G8, G14, D7, G8, \
G11, G13, D8, G11, G10, G13, D9, G10, G9, G13, D10, G9, G8, G13, D11, G8, \
G11, G12, D12, G11, G10, G12, D13, G10, G9, G12, D14, G9, G8, G12, D15, G8
PTR_ADDI T0, A0, 184 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 168 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 152 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 136 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G11, G11, D0
GNMSUB xvf, d, G10, G11, D1, G10, G9, G11, D2, G9, G8, G11, D3, G8
xvfmul.d G10, G10, D4
GNMSUB xvf, d, G9, G10, D5, G9, G8, G10, D6, G8
xvfmul.d G9, G9, D7
GNMSUB xvf, d, G8, G9, D8, G8
xvfmul.d G8, G8, D9
// Store B
.if \N == 4
// x x x x ... x x x x
// x x x x ... x x x x
// b32 b33 b34 b35 ... b44 b45 b46 b47
// b48 b49 b50 b51 ... b60 b61 b62 b63
GST xv, , G8, B0, 32 * 8, G9, B0, 36 * 8, G10, B0, 40 * 8, G11, B0, 44 * 8
.elseif \N == 2
// x x x x ... x x x x
// x x x x ... x x x x
// b16 b17 b18 b19 b20 b21 b22 b23
// b24 b25 b26 b27 b28 b29 b30 b31
GST v, , $vr2, B0, 16 * 8, $vr6, B0, 18 * 8, $vr10, B0, 20 * 8, $vr14, B0, 22 * 8
.elseif \N == 1
// x x x x
// x x x x
// b8 b9 b10 b11
// b12 b13 b14 b15
GST f, d, $f2, B0, 8 * 8, $f6, B0, 9 * 8, $f10, B0, 10 * 8, $f14, B0, 11 * 8
.endif
// Transpose G11 G10 G9 G8
GTRANSPOSE4x4_D G8, G9, G10, G11, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
// x x x x ... c8 c9 c10 c11 c12 c13 c14 c15
// x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
// x x x x ... c40 c41 c42 c43 c44 c45 c46 c47
// x x x x ... c56 c57 c58 c59 c60 c61 c62 c63
GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8, D2, C2, 8 * 8, D3, C3, 8 * 8
.elseif \N == 2
// x x x x ... c8 c9 c10 c11 c12 c13 c14 c15
// x x x x ... c24 c25 c26 c27 c28 c29 c30 c31
GST xv, , D0, C0, 8 * 8, D1, C1, 8 * 8
.elseif \N == 1
// x x x x ... c8 c9 c10 c11 c12 c13 c14 c15
GST xv, , D0, C0, 8 * 8
.endif
#define G4 U1
#define G5 U5
#define G6 U9
#define G7 U13
GTRANSPOSE4x4_D U1, U5, U9, U13, G4, G5, G6, G7, D0, D1
// A
// G4 G5 G6 G7
// ------------------
// 68 | D9
// 84 85 | D8 D7
// 100 101 102 | D6 D5 D4
// 116 117 118 119 | D3 D2 D1 D0
// 132 133 134 135 | D15 D14 D13 D12
// 148 149 150 151 | D11 D10 D9 D8
// 164 165 166 167 | D7 D6 D5 D4
// 180 181 182 183 | D3 D2 D1 D0
// 196 197 198 199 | D15 D14 D13 D12
// 212 213 214 215 | D11 D10 D9 D8
// 228 229 230 231 | D7 D6 D5 D4
// 244 245 246 247 | D3 D2 D1 D0
PTR_ADDI T0, A0, 244 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 228 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 212 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 196 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G7, G15, D0, G7, G6, G15, D1, G6, G5, G15, D2, G5, G4, G15, D3, G4, \
G7, G14, D4, G7, G6, G14, D5, G6, G5, G14, D6, G5, G4, G14, D7, G4, \
G7, G13, D8, G7, G6, G13, D9, G6, G5, G13, D10, G5, G4, G13, D11, G4, \
G7, G12, D12, G7, G6, G12, D13, G6, G5, G12, D14, G5, G4, G12, D15, G4
PTR_ADDI T0, A0, 180 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 164 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 148 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 132 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G7, G11, D0, G7, G6, G11, D1, G6, G5, G11, D2, G5, G4, G11, D3, G4, \
G7, G10, D4, G7, G6, G10, D5, G6, G5, G10, D6, G5, G4, G10, D7, G4, \
G7, G9, D8, G7, G6, G9, D9, G6, G5, G9, D10, G5, G4, G9, D11, G4, \
G7, G8, D12, G7, G6, G8, D13, G6, G5, G8, D14, G5, G4, G8, D15, G4
PTR_ADDI T0, A0, 116 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 100 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 84 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 68 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G7, G7, D0
GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
xvfmul.d G6, G6, D4
GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
xvfmul.d G5, G5, D7
GNMSUB xvf, d, G4, G5, D8, G4
xvfmul.d G4, G4, D9
// Store B
.if \N == 4
// x x x x ... x x x x
// b16 b17 b18 b19 ... b28 b29 b30 b31
// b32 b33 b34 b35 ... b44 b45 b46 b47
// b48 b49 b50 b51 ... b60 b61 b62 b63
GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
.elseif \N == 2
// x x x x ... x x x x
// b8 b9 b10 b11 b12 b13 b14 b15
// b16 b17 b18 b19 b20 b21 b22 b23
// b24 b25 b26 b27 b28 b29 b30 b31
GST v, , $vr1, B0, 8 * 8, $vr5, B0, 10 * 8, $vr9, B0, 12 * 8, $vr13, B0, 14 * 8
.elseif \N == 1
// x x x x
// b4 b5 b6 b7
// b8 b9 b10 b11
// b12 b13 b14 b15
GST f, d, $f1, B0, 4 * 8, $f5, B0, 5 * 8, $f9, B0, 6 * 8, $f13, B0, 7 * 8
.endif
// Transpose G7 G6 G5 G4
GTRANSPOSE4x4_D G4, G5, G6, G7, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
// x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
// x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
// x x x x c36 c37 c38 c39 c40 c41 c42 c43 c44 c45 c46 c47
// x x x x c52 c53 c54 c55 c56 c57 c58 c59 c60 c61 c62 c63
GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8, D2, C2, 4 * 8, D3, C3, 4 * 8
.elseif \N == 2
// x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
// x x x x c20 c21 c22 c23 c24 c25 c26 c27 c28 c29 c30 c31
GST xv, , D0, C0, 4 * 8, D1, C1, 4 * 8
.elseif \N == 1
// x x x x c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15
GST xv, , D0, C0, 4 * 8
.endif
#define G0 U0
#define G1 U4
#define G2 U8
#define G3 U12
GTRANSPOSE4x4_D U0, U4, U8, U12, G0, G1, G2, G3, D0, D1
// A
// G0 G1 G2 G3
// ------------------
// 0 | D9
// 16 17 | D8 D7
// 32 33 34 | D6 D5 D4
// 48 49 50 51 | D3 D2 D1 D0
// 64 65 66 67 | D15 D14 D13 D12
// 80 81 82 83 | D11 D10 D9 D8
// 96 97 98 99 | D7 D6 D5 D4
// 112 113 114 115 | D3 D2 D1 D0
// 128 129 130 131 | D15 D14 D13 D12
// 144 145 146 147 | D11 D10 D9 D8
// 160 161 162 163 | D7 D6 D5 D4
// 176 177 178 179 | D3 D2 D1 D0
// 192 193 194 195 | D15 D14 D13 D12
// 208 209 210 211 | D11 D10 D9 D8
// 224 225 226 227 | D7 D6 D5 D4
// 240 241 242 243 | D3 D2 D1 D0
PTR_ADDI T0, A0, 240 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 224 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 208 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 192 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G3, G15, D0, G3, G2, G15, D1, G2, G1, G15, D2, G1, G0, G15, D3, G0, \
G3, G14, D4, G3, G2, G14, D5, G2, G1, G14, D6, G1, G0, G14, D7, G0, \
G3, G13, D8, G3, G2, G13, D9, G2, G1, G13, D10, G1, G0, G13, D11, G0, \
G3, G12, D12, G3, G2, G12, D13, G2, G1, G12, D14, G1, G0, G12, D15, G0
PTR_ADDI T0, A0, 176 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 160 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 144 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 128 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G3, G11, D0, G3, G2, G11, D1, G2, G1, G11, D2, G1, G0, G11, D3, G0, \
G3, G10, D4, G3, G2, G10, D5, G2, G1, G10, D6, G1, G0, G10, D7, G0, \
G3, G9, D8, G3, G2, G9, D9, G2, G1, G9, D10, G1, G0, G9, D11, G0, \
G3, G8, D12, G3, G2, G8, D13, G2, G1, G8, D14, G1, G0, G8, D15, G0
PTR_ADDI T0, A0, 112 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 96 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 80 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 64 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \
G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \
G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \
G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
PTR_ADDI T0, A0, 48 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 32 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 16 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 0 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G3, G3, D0
GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
xvfmul.d G2, G2, D4
GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
xvfmul.d G1, G1, D7
GNMSUB xvf, d, G0, G1, D8, G0
xvfmul.d G0, G0, D9
// Store B
.if \N == 4
// b0 b1 b2 b3 ... b12 b13 b14 b15
// b16 b17 b18 b19 ... b28 b29 b30 b31
// b32 b33 b34 b35 ... b44 b45 b46 b47
// b48 b49 b50 b51 ... b60 b61 b62 b63
GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
.elseif \N == 2
// b0 b1 b2 b3 b4 b5 b6 b7
// b8 b9 b10 b11 b12 b13 b14 b15
// b16 b17 b18 b19 b20 b21 b22 b23
// b24 b25 b26 b27 b28 b29 b30 b31
GST v, , $vr0, B0, 0, $vr4, B0, 2 * 8, $vr8, B0, 4 * 8, $vr12, B0, 6 * 8
.elseif \N == 1
// b0 b1 b2 b3
// b4 b5 b6 b7
// b8 b9 b10 b11
// b12 b13 b14 b15
GST f, d, $f0, B0, 0, $f4, B0, 1 * 8, $f8, B0, 2 * 8, $f12, B0, 3 * 8
.endif
// Transpose G3 G2 G1 G0
GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
// c0 c1 c2 c3 ... c12 c13 c14 c15
// c16 c17 c18 c19 ... c28 c29 c30 c31
// c32 c33 c34 c35 ... c44 c45 c46 c47
// c48 c49 c50 c51 ... c60 c61 c62 c63
GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
.elseif \N == 2
// c0 c1 c2 c3 ... c12 c13 c14 c15
// c16 c17 c18 c19 ... c28 c29 c30 c31
GST xv, , D0, C0, 0, D1, C1, 0
.elseif \N == 1
// c0 c1 c2 c3 ... c12 c13 c14 c15
GST xv, , D0, C0, 0
.endif
#undef G0
#undef G1
#undef G2
#undef G3
#undef G4
#undef G5
#undef G6
#undef G7
#undef G8
#undef G9
#undef G10
#undef G11
#undef G12
#undef G13
#undef G14
#undef G15
.endm
.macro dsolve_8 N
// if N = 4 the data layout of C is as follows:
// U0 U1
// U2 U3
// U4 U5
// U6 U7
// if N = 2 the data layout of C is as follows:
// U0 U1
// U2 U3
// if N = 1 the data layout of C is as follows:
// U0 U1
// The matrix A has dimensions of 8x8, and
// it will be divided into 2 segments for processing.
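// Same scheme as dsolve_16, but the packed A block is 8x8 (row-major,
// offset = row * 8 + col) and the rows are solved in two groups of four;
// the diagonal entries are again assumed to be stored as reciprocals.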
#define G4 U1
#define G5 U3
#define G6 U5
#define G7 U7
// Transpose U7 U5 U3 U1
GTRANSPOSE4x4_D U1, U3, U5, U7, G4, G5, G6, G7, D0, D1
// A
// G4 G5 G6 G7
// ---------------
// 36 | D9
// 44 45 | D8 D7
// 52 53 54 | D6 D5 D4
// 60 61 62 63 | D3 D2 D1 D0
PTR_ADDI T0, A0, 60 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 52 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 44 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 36 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G7, G7, D0
GNMSUB xvf, d, G6, G7, D1, G6, G5, G7, D2, G5, G4, G7, D3, G4
xvfmul.d G6, G6, D4
GNMSUB xvf, d, G5, G6, D5, G5, G4, G6, D6, G4
xvfmul.d G5, G5, D7
GNMSUB xvf, d, G4, G5, D8, G4
xvfmul.d G4, G4, D9
// Store B
.if \N == 4
GST xv, , G4, B0, 16 * 8, G5, B0, 20 * 8, G6, B0, 24 * 8, G7, B0, 28 * 8
.elseif \N == 2
GST v, , $vr1, B0, 8 * 8, $vr3, B0, 10 * 8, $vr5, B0, 12 * 8, $vr7, B0, 14 * 8
.elseif \N == 1
GST f, d, $f1, B0, 4 * 8, $f3, B0, 5 * 8, $f5, B0, 6 * 8, $f7, B0, 7 * 8
.endif
// Transpose
GTRANSPOSE4x4_D G4, G5, G6, G7, D4, D5, D6, D7, D8, D9
// Store C
.if \N == 4
GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8, D6, C2, 4 * 8, D7, C3, 4 * 8
.elseif \N == 2
GST xv, , D4, C0, 4 * 8, D5, C1, 4 * 8
.elseif \N == 1
GST xv, , D4, C0, 4 * 8
.endif
#define G0 U0
#define G1 U2
#define G2 U4
#define G3 U6
// Transpose U6 U4 U2 U0
GTRANSPOSE4x4_D U0, U2, U4, U6, G0, G1, G2, G3, D0, D1
// A
// G0 G1 G2 G3
//-----------------
// 0 | D9
// 8 9 | D8 D7
// 16 17 18 | D6 D5 D4
// 24 25 26 27 | D3 D2 D1 D0
// 32 33 34 35 | D15 D14 D13 D12
// 40 41 42 43 | D11 D10 D9 D8
// 48 49 50 51 | D7 D6 D5 D4
// 56 57 58 59 | D3 D2 D1 D0
PTR_ADDI T0, A0, 56 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 48 * 8
GLDREPL xv, d, D7, T0, 0, D6, T0, 1 * 8, D5, T0, 2 * 8, D4, T0, 3 * 8
PTR_ADDI T0, A0, 40 * 8
GLDREPL xv, d, D11, T0, 0, D10, T0, 1 * 8, D9, T0, 2 * 8, D8, T0, 3 * 8
PTR_ADDI T0, A0, 32 * 8
GLDREPL xv, d, D15, T0, 0, D14, T0, 1 * 8, D13, T0, 2 * 8, D12, T0, 3 * 8
GNMSUB xvf, d, G3, G7, D0, G3, G2, G7, D1, G2, G1, G7, D2, G1, G0, G7, D3, G0, \
G3, G6, D4, G3, G2, G6, D5, G2, G1, G6, D6, G1, G0, G6, D7, G0, \
G3, G5, D8, G3, G2, G5, D9, G2, G1, G5, D10, G1, G0, G5, D11, G0, \
G3, G4, D12, G3, G2, G4, D13, G2, G1, G4, D14, G1, G0, G4, D15, G0
PTR_ADDI T0, A0, 24 * 8
GLDREPL xv, d, D3, T0, 0, D2, T0, 1 * 8, D1, T0, 2 * 8, D0, T0, 3 * 8
PTR_ADDI T0, A0, 16 * 8
GLDREPL xv, d, D6, T0, 0, D5, T0, 1 * 8, D4, T0, 2 * 8
PTR_ADDI T0, A0, 8 * 8
GLDREPL xv, d, D8, T0, 0, D7, T0, 1 * 8
PTR_ADDI T0, A0, 0 * 8
GLDREPL xv, d, D9, T0, 0
xvfmul.d G3, G3, D0
GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
xvfmul.d G2, G2, D4
GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
xvfmul.d G1, G1, D7
GNMSUB xvf, d, G0, G1, D8, G0
xvfmul.d G0, G0, D9
// Store B
.if \N == 4
GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
.elseif \N == 2
GST v, , $vr0, B0, 0, $vr2, B0, 2 * 8, $vr4, B0, 4 * 8, $vr6, B0, 6 * 8
.elseif \N == 1
GST f, d, $f0, B0, 0, $f2, B0, 1 * 8, $f4, B0, 2 * 8, $f6, B0, 3 * 8
.endif
// Transpose
GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
.elseif \N == 2
GST xv, , D0, C0, 0, D1, C1, 0
.elseif \N == 1
GST xv, , D0, C0, 0
.endif
#undef G0
#undef G1
#undef G2
#undef G3
#undef G4
#undef G5
#undef G6
#undef G7
.endm
.macro dsolve_4 N
// if N = 4 the data layout of C is as follows:
// U0
// U1
// U2
// U3
// if N = 2 the data layout of C is as follows:
// U0
// U1
// if N = 1 the data layout of C is as follows:
// U0
// The matrix A has dimensions of 4x4, and
// it will be processed as a single segment.
#define G0 U0
#define G1 U1
#define G2 U2
#define G3 U3
// Transpose U3 U2 U1 U0
GTRANSPOSE4x4_D U0, U1, U2, U3, G0, G1, G2, G3, D0, D1
// A
// G0 G1 G2 G3
//-------------
// 0 | D9
// 4 5 | D8 D7
// 8 9 10 | D6 D5 D4
// 12 13 14 15 | D3 D2 D1 D0
GLDREPL xv, d, D3, A0, 12 * 8, D2, A0, 13 * 8, D1, A0, 14 * 8, D0, A0, 15 * 8, \
D6, A0, 8 * 8, D5, A0, 9 * 8, D4, A0, 10 * 8, \
D8, A0, 4 * 8, D7, A0, 5 * 8, \
D9, A0, 0 * 8
xvfmul.d G3, G3, D0
GNMSUB xvf, d, G2, G3, D1, G2, G1, G3, D2, G1, G0, G3, D3, G0
xvfmul.d G2, G2, D4
GNMSUB xvf, d, G1, G2, D5, G1, G0, G2, D6, G0
xvfmul.d G1, G1, D7
GNMSUB xvf, d, G0, G1, D8, G0
xvfmul.d G0, G0, D9
// Store B
.if \N == 4
GST xv, , G0, B0, 0, G1, B0, 4 * 8, G2, B0, 8 * 8, G3, B0, 12 * 8
.elseif \N == 2
GST v, , $vr0, B0, 0, $vr1, B0, 2 * 8, $vr2, B0, 4 * 8, $vr3, B0, 6 * 8
.elseif \N == 1
GST f, d, $f0, B0, 0, $f1, B0, 1 * 8, $f2, B0, 2 * 8, $f3, B0, 3 * 8
.endif
// Transpose
GTRANSPOSE4x4_D G0, G1, G2, G3, D0, D1, D2, D3, D4, D5
// Store C
.if \N == 4
GST xv, , D0, C0, 0, D1, C1, 0, D2, C2, 0, D3, C3, 0
.elseif \N == 2
GST xv, , D0, C0, 0, D1, C1, 0
.elseif \N == 1
GST xv, , D0, C0, 0
.endif
#undef G0
#undef G1
#undef G2
#undef G3
.endm
.macro dsolve_2 N
#define G0 U2
#define G1 U3
// Transpose
GSBUTTERFLY xv, d, G0, G1, U1, U0
// A
// G0 G1
// ------
// 0 | D2
// 2 3 | D1 D0
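// Scalar view (diagonal entries assumed pre-inverted):
// x1 = c1 * a(1,1)
// x0 = (c0 - a(1,0)*x1) * a(0,0)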
GLDREPL xv, d, D2, A0, 0, D1, A0, 2 * 8, D0, A0, 3 * 8
xvfmul.d G1, G1, D0
GNMSUB xvf, d, G0, G1, D1, G0
xvfmul.d G0, G0, D2
// Store B
.if \N == 4
GST xv, , G0, B0, 0, G1, B0, 4 * 8
.elseif \N == 2
GST v, , $vr2, B0, 0, $vr3, B0, 2 * 8
.elseif \N == 1
GST f, d, $f2, B0, 0, $f3, B0, 8
.endif
// Transpose
GSBUTTERFLY xv, d, D0, D1, G1, G0
// Store C
.if \N == 4
vst $vr16, C0, 0x00
vst $vr17, C1, 0x00
xvstelm.d D0, C2, 0x00, 0x02
xvstelm.d D1, C3, 0x00, 0x02
xvstelm.d D0, C2, 0x08, 0x03
xvstelm.d D1, C3, 0x08, 0x03
.elseif \N == 2
GST v, , $vr16, C0, 0, $vr17, C1, 0
.elseif \N == 1
GST v, , $vr16, C0, 0
.endif
#undef G0
#undef G1
.endm
.macro dgemm_dsolve_16x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x4_load
dgemm_16x4
b .L_dsolve_16x4
.L_dsolve_16x4_load:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
/********************** solver ******************/
.L_dsolve_16x4:
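// Rewind: T1/T2 still hold the A0/B0 values from macro entry. Step A0 back
// 16 * 16 * 8 bytes (in two PTR_ADDI steps) to the start of the packed 16x16
// triangular A block, and B0 back 16 * 4 * 8 bytes to the start of the 16x4
// B panel being solved.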
PTR_ADDI A0, T1, -(16 * 8 * 8)
PTR_ADDI A0, A0, -(16 * 8 * 8)
PTR_ADDI B0, T2, -(16 * 4 * 8)
dsolve_16 4
.endm
.macro dgemm_dsolve_1x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_1x4_load
dgemm_1x4
b .L_dsolve_1x4
.L_dsolve_1x4_load:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
xvinsve0.d U0, U1, 0x01
xvinsve0.d U0, U2, 0x02
xvinsve0.d U0, U3, 0x03
.L_dsolve_1x4:
or A0, T1, T1
or B0, T2, T2
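// A0 - 8 holds the 1x1 diagonal element of the packed A block (assumed
// pre-inverted); the solved row is scattered to the four C columns below and
// stored to the packed B row at B0 - 32 bytes.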
GLDREPL xv, d, D0, A0, -1 * 8
GMUL xvf, d, U0, U0, D0
// Store C
xvstelm.d U0, C0, 0x00, 0x00
xvstelm.d U0, C1, 0x00, 0x01
xvstelm.d U0, C2, 0x00, 0x02
xvstelm.d U0, C3, 0x00, 0x03
// Store B
xvst U0, B0, -32
.endm
.macro dgemm_dsolve_2x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x4_load
dgemm_2x4
b .L_dsolve_2x4
.L_dsolve_2x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
xvpermi.q U0, U2, 0x02
xvpermi.q U1, U3, 0x02
/********************** solver ******************/
.L_dsolve_2x4:
PTR_ADDI A0, T1, -(2 * 2 * 8)
PTR_ADDI B0, T2, -(2 * 4 * 8)
dsolve_2 4
.endm
.macro dgemm_dsolve_4x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x4_load
dgemm_4x4
b .L_dsolve_4x4
.L_dsolve_4x4_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
/************** solver *****************/
.L_dsolve_4x4:
PTR_ADDI A0, T1, -(4 * 4 * 8)
PTR_ADDI B0, T2, -(4 * 4 * 8)
dsolve_4 4
.endm
.macro dgemm_dsolve_8x4
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x4_load
dgemm_8x4
b .L_dsolve_8x4
.L_dsolve_8x4_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
/********* solver *********/
.L_dsolve_8x4:
PTR_ADDI A0, T1, -(8 * 8 * 8)
PTR_ADDI B0, T2, -(8 * 4 * 8)
dsolve_8 4
.endm
.macro dgemm_dsolve_4x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x2_load
dgemm_4x2
b .L_dsolve_4x2
.L_dsolve_4x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_4x2:
PTR_ADDI A0, T1, -(4 * 4 * 8)
PTR_ADDI B0, T2, -(4 * 2 * 8)
dsolve_4 2
.endm
.macro dgemm_dsolve_2x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x2_load
dgemm_2x2
b .L_dsolve_2x2
.L_dsolve_2x2_load:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
.L_dsolve_2x2:
PTR_ADDI A0, T1, -(2 * 2 * 8)
PTR_ADDI B0, T2, -(2 * 2 * 8)
dsolve_2 2
.endm
.macro dgemm_dsolve_8x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x2_load
dgemm_8x2
b .L_dsolve_8x2
.L_dsolve_8x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
.L_dsolve_8x2:
PTR_ADDI A0, T1, -(8 * 8 * 8)
PTR_ADDI B0, T2, -(8 * 2 * 8)
dsolve_8 2
.endm
.macro dgemm_dsolve_16x2
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x2_load
dgemm_16x2
b .L_dsolve_16x2
.L_dsolve_16x2_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
.L_dsolve_16x2:
PTR_ADDI A0, T1, -(16 * 8 * 8)
PTR_ADDI A0, A0, -(16 * 8 * 8)
PTR_ADDI B0, T2, -(16 * 2 * 8)
dsolve_16 2
.endm
.macro dgemm_dsolve_2x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_2x1_load
dgemm_2x1
b .L_dsolve_2x1
.L_dsolve_2x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_2x1:
PTR_ADDI A0, T1, -(2 * 2 * 8)
PTR_ADDI B0, T2, -(2 * 1 * 8)
dsolve_2 1
.endm
.macro dgemm_dsolve_4x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_4x1_load
dgemm_4x1
b .L_dsolve_4x1
.L_dsolve_4x1_load:
/* Load C0 */
xvld U0, C0, 0x00
.L_dsolve_4x1:
PTR_ADDI A0, T1, -(4 * 4 * 8)
PTR_ADDI B0, T2, -(4 * 1 * 8)
dsolve_4 1
.endm
.macro dgemm_dsolve_8x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_8x1_load
dgemm_8x1
b .L_dsolve_8x1
.L_dsolve_8x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
.L_dsolve_8x1:
PTR_ADDI A0, T1, -(8 * 8 * 8)
PTR_ADDI B0, T2, -(8 * 1 * 8)
dsolve_8 1
.endm
.macro dgemm_dsolve_16x1
or T1, A0, A0
or T2, B0, B0
bge ZERO, L, .L_dsolve_16x1_load
dgemm_16x1
b .L_dsolve_16x1
.L_dsolve_16x1_load:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
.L_dsolve_16x1:
PTR_ADDI A0, T1, -(16 * 8 * 8)
PTR_ADDI A0, A0, -(16 * 8 * 8)
PTR_ADDI B0, T2, -(16 * 1 * 8)
dsolve_16 1
.endm
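// Driver layout: the outer loop peels 4, then 2, then 1 columns of B/C.
// Within each, KK starts at M + OFFSET; the M tails (1-, 2-, 4- and 8-row
// blocks at the bottom of the panel) are solved first, then the 16-row blocks
// walk upward, with KK, CC and AA stepping back by one block per iteration.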
PROLOGUE
push_if_used 26, 32
PTR_SLLI LDC, LDC, 3
/* if (!(N >> 2)) goto L_N3 */
PTR_SRAI J, N, 2 /* J = bn >> 2 */
andi N, N, 0x03
beq ZERO, J, .L_N3
.align 5
.L_J1:
PTR_ADDI J, J, -1
PTR_ADD KK, M, OFFSET
andi I, M, 15
beq ZERO, I, .L_M16
andi I, M, 1
beqz I, .L_M2
.L_M1:
PTR_ADDI T0, M, -1
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */
PTR_ADD CC, T0, C /* c + (m - 1) */
PTR_SLLI T0, KK, 5
PTR_ADD B0, B, T0 /* b + 4 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
dgemm_dsolve_1x4
PTR_ADDI KK, KK, -1
.L_M2:
andi I, M, 2
beqz I, .L_M4
PTR_SRLI T0, M, 1
PTR_SLLI T0, T0, 1
PTR_ADDI T0, T0, -2
PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */
PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/
PTR_SLLI T1, KK, 4
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */
PTR_SLLI T0, KK, 5
PTR_ADD B0, B, T0 /* b + 4 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
dgemm_dsolve_2x4
PTR_ADDI KK, KK, -2
.L_M4:
andi I, M, 4
beqz I, .L_M8
PTR_SRLI T0, M, 2
PTR_SLLI T0, T0, 2
PTR_ADDI T0, T0, -4
PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */
PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/
PTR_SLLI T1, KK, 5
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */
PTR_SLLI T0, KK, 5
PTR_ADD B0, B, T0 /* b + 4 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
dgemm_dsolve_4x4
PTR_ADDI KK, KK, -4
.L_M8:
andi I, M, 8
beqz I, .L_M16
PTR_SRLI T0, M, 3
PTR_SLLI T0, T0, 3
PTR_ADDI T0, T0, -8
PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */
PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/
PTR_SLLI T1, KK, 6
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */
PTR_SLLI T0, KK, 5
PTR_ADD B0, B, T0 /* b + 4 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
dgemm_dsolve_8x4
PTR_ADDI KK, KK, -8
.L_M16:
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_M0
PTR_SRLI T0, M, 4
PTR_SLLI T0, T0, 4
PTR_ADDI T0, T0, -16 /* ((m & -16) - 16) */
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, A, AA
PTR_ADD CC, C, T0
.align 5
.L_I1:
PTR_SLLI T0, KK, 5
PTR_ADD B0, B, T0
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC, C2, C1, LDC, C3, C2, LDC
PTR_SLLI T0, KK, 7
PTR_ADD A0, AA, T0
dgemm_dsolve_16x4
PTR_ADDI I, I, -1
PTR_ADDI KK, KK, -16
PTR_ADDI CC, CC, -(16 * 8)
PTR_SLLI T0, K, 7
PTR_SUB AA, AA, T0
blt ZERO, I, .L_I1
.L_M0:
PTR_SLLI T0, K, 3
PTR_ALSL B, T0, B, 2 // b += 4 * k;
PTR_ALSL C, LDC, C, 2 // c += 4 * ldc
blt ZERO, J, .L_J1
.L_N3:
andi J, N, 2
beq ZERO, J, .L_N1
PTR_ADD KK, M, OFFSET
andi I, M, 15
beq ZERO, I, .L_N3_M16
andi I, M, 1
beqz I, .L_N3_M2
.L_N3_M1:
PTR_ADDI KK, KK, -1
PTR_ADDI T0, M, -1
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */
PTR_ADD CC, T0, C /* c + (m - 1) */
PTR_SLLI T0, KK, 4
PTR_ADD B0, B, T0 /* b + 2 * kk */
GADD , d, C0, CC, ZERO, C1, C0, LDC
// dgemm_dsolve_1x2
GLD f, d, $f0, A0, 0, $f1, C0, 0, $f2, C1, 0
GMUL f, d, $f1, $f1, $f0, $f2, $f2, $f0
GST f, d, $f1, C0, 0, $f2, C1, 0, $f1, B0, 0, $f2, B0, 8
.L_N3_M2:
andi I, M, 2
beqz I, .L_N3_M4
PTR_SRLI T0, M, 1
PTR_SLLI T0, T0, 1
PTR_ADDI T0, T0, -2
PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */
PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/
PTR_SLLI T1, KK, 4
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */
PTR_SLLI T0, KK, 4
PTR_ADD B0, B, T0 /* b + 2 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC
dgemm_dsolve_2x2
PTR_ADDI KK, KK, -2
.L_N3_M4:
andi I, M, 4
beqz I, .L_N3_M8
PTR_SRLI T0, M, 2
PTR_SLLI T0, T0, 2
PTR_ADDI T0, T0, -4
PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */
PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/
PTR_SLLI T1, KK, 5
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */
PTR_SLLI T0, KK, 4
PTR_ADD B0, B, T0 /* b + 2 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC
dgemm_dsolve_4x2
PTR_ADDI KK, KK, -4
.L_N3_M8:
andi I, M, 8
beqz I, .L_N3_M16
PTR_SRLI T0, M, 3
PTR_SLLI T0, T0, 3
PTR_ADDI T0, T0, -8
PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */
PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/
PTR_SLLI T1, KK, 6
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */
PTR_SLLI T0, KK, 4
PTR_ADD B0, B, T0 /* b + 2 * kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC
dgemm_dsolve_8x2
PTR_ADDI KK, KK, -8
.L_N3_M16:
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N3_M0
PTR_SRLI T0, M, 4
PTR_SLLI T0, T0, 4
PTR_ADDI T0, T0, -16 /* ((m & -16) - 16) */
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, A, AA
PTR_ADD CC, C, T0
.align 5
.L_N3_I1:
PTR_SLLI T0, KK, 4
PTR_ADD B0, B, T0
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO, C1, C0, LDC
PTR_SLLI T0, KK, 7
PTR_ADD A0, AA, T0
dgemm_dsolve_16x2
PTR_ADDI I, I, -1
PTR_ADDI KK, KK, -16
PTR_ADDI CC, CC, -(16 * 8)
PTR_SLLI T0, K, 7
PTR_SUB AA, AA, T0
blt ZERO, I, .L_N3_I1
.L_N3_M0:
PTR_SLLI T0, K, 3
PTR_ALSL B, T0, B, 1 // b += 2 * k;
PTR_ALSL C, LDC, C, 1 // c += 2 * ldc
.L_N1:
andi J, N, 1
beq ZERO, J, .L_N0
PTR_ADD KK, M, OFFSET
andi I, M, 15
beq ZERO, I, .L_N1_M16
andi I, M, 1
beqz I, .L_N1_M2
.L_N1_M1:
PTR_ADDI KK, KK, -1
PTR_ADDI T0, M, -1
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ALSL A0, KK, AA, 3 /* a + (m - 1) * k + kk */
PTR_ADD CC, T0, C /* c + (m - 1) */
PTR_SLLI T0, KK, 3
PTR_ADD B0, B, T0 /* b + kk */
GADD , d, C0, CC, ZERO
// dgemm_dsolve_1x1
GLD f, d, $f0, A0, 0, $f1, C0, 0
GMUL f, d, $f1, $f1, $f0
GST f, d, $f1, C0, 0, $f1, B0, 0
.L_N1_M2:
andi I, M, 2
beqz I, .L_N1_M4
PTR_SRLI T0, M, 1
PTR_SLLI T0, T0, 1
PTR_ADDI T0, T0, -2
PTR_SLLI T0, T0, 3 /* ((m & -2) - 2) */
PTR_ADD CC, T0, C /* c + ((m & -2) - 2)*/
PTR_SLLI T1, KK, 4
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -2) - 2) * k + 2 * kk */
PTR_SLLI T0, KK, 3
PTR_ADD B0, B, T0 /* b + kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO
dgemm_dsolve_2x1
PTR_ADDI KK, KK, -2
.L_N1_M4:
andi I, M, 4
beqz I, .L_N1_M8
PTR_SRLI T0, M, 2
PTR_SLLI T0, T0, 2
PTR_ADDI T0, T0, -4
PTR_SLLI T0, T0, 3 /* ((m & -4) - 4) */
PTR_ADD CC, T0, C /* c + ((m & -4) - 4)*/
PTR_SLLI T1, KK, 5
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -4) - 4) * k + 4 * kk */
PTR_SLLI T0, KK, 3
PTR_ADD B0, B, T0 /* b + kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO
dgemm_dsolve_4x1
PTR_ADDI KK, KK, -4
.L_N1_M8:
andi I, M, 8
beqz I, .L_N1_M16
PTR_SRLI T0, M, 3
PTR_SLLI T0, T0, 3
PTR_ADDI T0, T0, -8
PTR_SLLI T0, T0, 3 /* ((m & -8) - 8) */
PTR_ADD CC, T0, C /* c + ((m & -8) - 8)*/
PTR_SLLI T1, KK, 6
PTR_MUL AA, T0, K
PTR_ADD AA, AA, A
PTR_ADD A0, AA, T1 /* a + ((m & -8) - 8) * k + 8 * kk */
PTR_SLLI T0, KK, 3
PTR_ADD B0, B, T0 /* b + kk */
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO
dgemm_dsolve_8x1
PTR_ADDI KK, KK, -8
.L_N1_M16:
PTR_SRAI I, M, 4 /* I = bm >> 4 */
beq ZERO, I, .L_N1_M0
PTR_SRLI T0, M, 4
PTR_SLLI T0, T0, 4
PTR_ADDI T0, T0, -16 /* ((m & -16) - 16) */
PTR_SLLI T0, T0, 3
PTR_MUL AA, T0, K
PTR_ADD AA, A, AA
PTR_ADD CC, C, T0
.align 5
.L_N1_I1:
PTR_SLLI T0, KK, 3
PTR_ADD B0, B, T0
PTR_SUB L, K, KK
GADD , d, C0, CC, ZERO
PTR_SLLI T0, KK, 7
PTR_ADD A0, AA, T0
dgemm_dsolve_16x1
PTR_ADDI I, I, -1
PTR_ADDI KK, KK, -16
PTR_ADDI CC, CC, -(16 * 8)
PTR_SLLI T0, K, 7
PTR_SUB AA, AA, T0
blt ZERO, I, .L_N1_I1
.L_N1_M0:
.L_N0:
pop_if_used 26, 32
jirl $r0, $r1, 0x0
EPILOGUE