/* OpenBLAS/kernel/loongarch64/dtrsm_kernel_macro.S */

/*******************************************************************************
Copyright (c) 2023, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*******************************************************************************/
/************** Dgemm Kernel 16x4 ****************/
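/* The KERNEL<u>x<m>x<n> macros below perform <u> unrolled steps of the inner
   (k) loop of an m x n DGEMM micro-kernel.  A0 and B0 walk packed panels of
   A and B, U0-U7 and U8-U15 hold two alternating operand sets so that the
   loads for the next step overlap the xvfmadd.d chain of the current one,
   and D0-D15 accumulate the partial products.  The *_END variants drain this
   software pipeline without loading past the end of the panels. */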
.macro KERNEL2x16x4
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvld U3, A0, 0x60
xvfmadd.d D6, U10, U13, D6
xvfmadd.d D7, U11, U13, D7
xvldrepl.d U4, B0, 0x00
xvfmadd.d D8, U8, U14, D8
xvfmadd.d D9, U9, U14, D9
preld 0, B0, B_PRE
xvldrepl.d U5, B0, 0x08
xvfmadd.d D10, U10, U14, D10
xvfmadd.d D11, U11, U14, D11
preld 0, A0, A_PRE
xvldrepl.d U6, B0, 0x10
xvfmadd.d D12, U8, U15, D12
xvfmadd.d D13, U9, U15, D13
preld 0, A0, A_PRE + 0x40
xvldrepl.d U7, B0, 0x18
xvfmadd.d D14, U10, U15, D14
xvfmadd.d D15, U11, U15, D15
addi.d A0, A0, 0x80
addi.d B0, B0, 0x20
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U9, A0, 0x20
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvld U10, A0, 0x40
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvld U11, A0, 0x60
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
xvldrepl.d U12, B0, 0x00
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
preld 0, B0, B_PRE
xvldrepl.d U13, B0, 0x08
xvfmadd.d D10, U2, U6, D10
xvfmadd.d D11, U3, U6, D11
preld 0, A0, A_PRE
xvldrepl.d U14, B0, 0x10
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
preld 0, A0, A_PRE + 0x40
xvldrepl.d U15, B0, 0x18
xvfmadd.d D14, U2, U7, D14
xvfmadd.d D15, U3, U7, D15
addi.d A0, A0, 0x80
addi.d B0, B0, 0x20
.endm
.macro KERNEL2x16x4_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvld U3, A0, 0x60
xvfmadd.d D6, U10, U13, D6
xvfmadd.d D7, U11, U13, D7
xvldrepl.d U4, B0, 0x00
xvfmadd.d D8, U8, U14, D8
xvfmadd.d D9, U9, U14, D9
preld 0, B0, B_PRE
xvldrepl.d U5, B0, 0x08
xvfmadd.d D10, U10, U14, D10
xvfmadd.d D11, U11, U14, D11
preld 0, A0, A_PRE
xvldrepl.d U6, B0, 0x10
xvfmadd.d D12, U8, U15, D12
xvfmadd.d D13, U9, U15, D13
preld 0, A0, A_PRE + 0x40
xvldrepl.d U7, B0, 0x18
xvfmadd.d D14, U10, U15, D14
xvfmadd.d D15, U11, U15, D15
addi.d A0, A0, 0x80
addi.d B0, B0, 0x20
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
preld 0, B0, B_PRE
xvfmadd.d D10, U2, U6, D10
xvfmadd.d D11, U3, U6, D11
preld 0, A0, A_PRE
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
preld 0, A0, A_PRE + 0x40
xvfmadd.d D14, U2, U7, D14
xvfmadd.d D15, U3, U7, D15
.endm
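/* KERNEL8x<m>x<n> expands KERNEL2x<m>x<n> four times (8 k steps); the _END
   form replaces the last expansion with the pipeline-draining variant. */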
.macro KERNEL8x16x4
.rept 4
KERNEL2x16x4
.endr
.endm
.macro KERNEL8x16x4_END
.rept 3
KERNEL2x16x4
.endr
KERNEL2x16x4_END
.endm
.macro KERNEL2x8x4
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U8, U14, D8
xvfmadd.d D9, U9, U14, D9
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U8, U15, D12
xvfmadd.d D13, U9, U15, D13
addi.d A0, A0, 0x40
addi.d B0, B0, 0x20
xvld U8, A0, 0x00
xvld U9, A0, 0x20
xvldrepl.d U12, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvldrepl.d U13, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvldrepl.d U14, B0, 0x10
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
xvldrepl.d U15, B0, 0x18
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
addi.d A0, A0, 0x40
addi.d B0, B0, 0x20
.endm
.macro KERNEL2x8x4_END
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U8, U14, D8
xvfmadd.d D9, U9, U14, D9
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U8, U15, D12
xvfmadd.d D13, U9, U15, D13
addi.d A0, A0, 0x40
addi.d B0, B0, 0x20
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
.endm
.macro KERNEL8x8x4
.rept 4
KERNEL2x8x4
.endr
.endm
.macro KERNEL8x8x4_END
.rept 3
KERNEL2x8x4
.endr
KERNEL2x8x4_END
.endm
.macro KERNEL2x4x4
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U8, U14, D8
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U8, U15, D12
addi.d A0, A0, 0x20
addi.d B0, B0, 0x20
xvld U8, A0, 0x00
xvldrepl.d U12, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U13, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvldrepl.d U14, B0, 0x10
xvfmadd.d D8, U0, U6, D8
xvldrepl.d U15, B0, 0x18
xvfmadd.d D12, U0, U7, D12
addi.d A0, A0, 0x20
addi.d B0, B0, 0x20
.endm
.macro KERNEL2x4x4_END
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U8, U14, D8
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U8, U15, D12
addi.d A0, A0, 0x20
addi.d B0, B0, 0x20
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D12, U0, U7, D12
.endm
.macro KERNEL8x4x4
.rept 4
KERNEL2x4x4
.endr
.endm
.macro KERNEL8x4x4_END
.rept 3
KERNEL2x4x4
.endr
KERNEL2x4x4_END
.endm
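/* For m < 4 with n = 4 (KERNEL*x2x4, KERNEL*x1x4) the operand roles swap:
   the A elements are broadcast with xvldrepl.d and a full 4-element row of B
   is loaded as one vector, so each D register holds one row of the result
   across the four columns. */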
.macro KERNEL2x2x4
xvldrepl.d U0, A0, 0x00
xvldrepl.d U1, A0, 0x08
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U4, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x20
xvldrepl.d U8, A0, 0x00
xvldrepl.d U9, A0, 0x08
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U12, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x20
.endm
.macro KERNEL2x2x4_END
xvldrepl.d U0, A0, 0x00
xvldrepl.d U1, A0, 0x08
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U4, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x20
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
.endm
.macro KERNEL8x2x4
.rept 4
KERNEL2x2x4
.endr
.endm
.macro KERNEL8x2x4_END
.rept 3
KERNEL2x2x4
.endr
KERNEL2x2x4_END
.endm
.macro KERNEL2x1x4
xvldrepl.d U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvld U4, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x20
xvldrepl.d U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvld U12, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x20
.endm
.macro KERNEL2x1x4_END
xvldrepl.d U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvld U4, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x20
xvfmadd.d D0, U0, U4, D0
.endm
.macro KERNEL8x1x4
.rept 4
KERNEL2x1x4
.endr
.endm
.macro KERNEL8x1x4_END
.rept 3
KERNEL2x1x4
.endr
KERNEL2x1x4_END
.endm
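/* n = 2 variants: two B values are broadcast per k step (U4/U5, U12/U13),
   B0 advances by 0x10, and only accumulators in the D0-D7 range are used. */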
.macro KERNEL2x16x2
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvld U3, A0, 0x60
xvfmadd.d D6, U10, U13, D6
xvfmadd.d D7, U11, U13, D7
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x80
addi.d B0, B0, 0x10
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U9, A0, 0x20
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvld U10, A0, 0x40
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvld U11, A0, 0x60
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x80
addi.d B0, B0, 0x10
.endm
.macro KERNEL2x16x2_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvld U3, A0, 0x60
xvfmadd.d D6, U10, U13, D6
xvfmadd.d D7, U11, U13, D7
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x80
addi.d B0, B0, 0x10
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
.endm
.macro KERNEL8x16x2
.rept 4
KERNEL2x16x2
.endr
.endm
.macro KERNEL8x16x2_END
.rept 3
KERNEL2x16x2
.endr
KERNEL2x16x2_END
.endm
.macro KERNEL2x8x2
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x40
addi.d B0, B0, 0x10
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U9, A0, 0x20
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x40
addi.d B0, B0, 0x10
.endm
.macro KERNEL2x8x2_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D4, U8, U13, D4
xvfmadd.d D5, U9, U13, D5
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x40
addi.d B0, B0, 0x10
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
.endm
.macro KERNEL8x8x2
.rept 4
KERNEL2x8x2
.endr
.endm
.macro KERNEL8x8x2_END
.rept 3
KERNEL2x8x2
.endr
KERNEL2x8x2_END
.endm
.macro KERNEL2x4x2
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x20
addi.d B0, B0, 0x10
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x20
addi.d B0, B0, 0x10
.endm
.macro KERNEL2x4x2_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x20
addi.d B0, B0, 0x10
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
.endm
.macro KERNEL8x4x2
.rept 4
KERNEL2x4x2
.endr
.endm
.macro KERNEL8x4x2_END
.rept 3
KERNEL2x4x2
.endr
KERNEL2x4x2_END
.endm
.macro KERNEL2x2x2
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x10
addi.d B0, B0, 0x10
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x10
addi.d B0, B0, 0x10
.endm
.macro KERNEL2x2x2_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x10
addi.d B0, B0, 0x10
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
.endm
.macro KERNEL8x2x2
.rept 4
KERNEL2x2x2
.endr
.endm
.macro KERNEL8x2x2_END
.rept 3
KERNEL2x2x2
.endr
KERNEL2x2x2_END
.endm
.macro KERNEL2x1x2
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
.endm
.macro KERNEL2x1x2_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D4, U8, U13, D4
xvldrepl.d U4, B0, 0x00
xvldrepl.d U5, B0, 0x08
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D4, U0, U5, D4
.endm
.macro KERNEL8x1x2
.rept 4
KERNEL2x1x2
.endr
.endm
.macro KERNEL8x1x2_END
.rept 3
KERNEL2x1x2
.endr
KERNEL2x1x2_END
.endm
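/* n = 1 variants: a single B value is broadcast per k step, B0 advances by
   0x08, and only D0-D3 are needed. */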
.macro KERNEL2x16x1
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x80
addi.d B0, B0, 0x08
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U9, A0, 0x20
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvld U10, A0, 0x40
xvld U11, A0, 0x60
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x80
addi.d B0, B0, 0x08
.endm
.macro KERNEL2x16x1_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvfmadd.d D2, U10, U12, D2
xvfmadd.d D3, U11, U12, D3
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x80
addi.d B0, B0, 0x08
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
.endm
.macro KERNEL8x16x1
.rept 4
KERNEL2x16x1
.endr
.endm
.macro KERNEL8x16x1_END
.rept 3
KERNEL2x16x1
.endr
KERNEL2x16x1_END
.endm
.macro KERNEL2x8x1
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x40
addi.d B0, B0, 0x08
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvld U9, A0, 0x20
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x40
addi.d B0, B0, 0x08
.endm
.macro KERNEL2x8x1_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvfmadd.d D1, U9, U12, D1
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x40
addi.d B0, B0, 0x08
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
.endm
.macro KERNEL8x8x1
.rept 4
KERNEL2x8x1
.endr
.endm
.macro KERNEL8x8x1_END
.rept 3
KERNEL2x8x1
.endr
KERNEL2x8x1_END
.endm
.macro KERNEL2x4x1
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x20
addi.d B0, B0, 0x08
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x20
addi.d B0, B0, 0x08
.endm
.macro KERNEL2x4x1_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x20
addi.d B0, B0, 0x08
xvfmadd.d D0, U0, U4, D0
.endm
.macro KERNEL8x4x1
.rept 4
KERNEL2x4x1
.endr
.endm
.macro KERNEL8x4x1_END
.rept 3
KERNEL2x4x1
.endr
KERNEL2x4x1_END
.endm
.macro KERNEL2x2x1
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x08
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x08
.endm
.macro KERNEL2x2x1_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x10
addi.d B0, B0, 0x08
xvfmadd.d D0, U0, U4, D0
.endm
.macro KERNEL8x2x1
.rept 4
KERNEL2x2x1
.endr
.endm
.macro KERNEL8x2x1_END
.rept 3
KERNEL2x2x1
.endr
KERNEL2x2x1_END
.endm
.macro KERNEL2x1x1
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
xvld U8, A0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
.endm
.macro KERNEL2x1x1_END
xvld U0, A0, 0x00
xvfmadd.d D0, U8, U12, D0
xvldrepl.d U4, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
xvfmadd.d D0, U0, U4, D0
.endm
.macro KERNEL8x1x1
.rept 4
KERNEL2x1x1
.endr
.endm
.macro KERNEL8x1x1_END
.rept 3
KERNEL2x1x1
.endr
KERNEL2x1x1_END
.endm
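/* The dgemm_<m>x<n> macros compute an m x n block of A * B over a panel of
   length L: the first k iteration initializes the accumulators with
   xvfmul.d, the bulk of the loop runs through KERNEL8x<m>x<n> eight k steps
   at a time, a short tail loop handles the remaining iterations, and the
   epilogue loads the matching block of C and leaves C - A*B in the U
   registers via GSUB.  Nothing is stored back to memory here; the including
   dtrsm kernels are expected to consume the U values.  Each macro defines
   its own local labels, so it can be expanded at most once per assembly
   file. */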
.macro dgemm_16x4
.L_dgemm_16x4: // See dgemm_kernel_16x4.S
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
xvfmul.d D2, U2, U4
xvfmul.d D3, U3, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
xvfmul.d D5, U1, U5
xvfmul.d D6, U2, U5
xvfmul.d D7, U3, U5
xvldrepl.d U6, B0, 0x10
/* line 3 */
xvfmul.d D8, U0, U6
xvfmul.d D9, U1, U6
xvfmul.d D10, U2, U6
xvfmul.d D11, U3, U6
xvldrepl.d U7, B0, 0x18
/* line 4 */
xvfmul.d D12, U0, U7
xvfmul.d D13, U1, U7
xvfmul.d D14, U2, U7
xvfmul.d D15, U3, U7
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_L7 */
beq ZERO,TL, .L_dgemm_16x4_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
xvld U10, A0, 0x40
xvld U11, A0, 0x60
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
xvldrepl.d U14, B0, 0x10
xvldrepl.d U15, B0, 0x18
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x20
beq ZERO, TL, .L_dgemm_16x4_TL1_END
.align 5
.L_dgemm_16x4_TL1:
KERNEL8x16x4
PTR_ADDI TL, TL, -1
blt ZERO, TL, .L_dgemm_16x4_TL1
.L_dgemm_16x4_TL1_END:
KERNEL8x16x4_END
.L_dgemm_16x4_L7:
andi TL, L, 7
beq TL, ZERO, .L_dgemm_16x4_L0
.align 5
.L_dgemm_16x4_L71:
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
xvfmadd.d D10, U2, U6, D10
xvfmadd.d D11, U3, U6, D11
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
xvfmadd.d D14, U2, U7, D14
xvfmadd.d D15, U3, U7, D15
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_16x4_L71
.L_dgemm_16x4_L0:
// Load C
GLD xv, , U0, C0, 0x00, U1, C0, 0x20, U2, C0, 0x40, U3, C0, 0x60
GLD xv, , U4, C1, 0x00, U5, C1, 0x20, U6, C1, 0x40, U7, C1, 0x60
GLD xv, , U8, C2, 0x00, U9, C2, 0x20, U10, C2, 0x40, U11, C2, 0x60
GLD xv, , U12, C3, 0x00, U13, C3, 0x20, U14, C3, 0x40, U15, C3, 0x60
GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7, \
U8, U8, D8, U9, U9, D9, U10, U10, D10, U11, U11, D11, \
U12, U12, D12, U13, U13, D13, U14, U14, D14, U15, U15, D15
.endm
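/* dgemm_1x4: one row of A against four columns of B.  The A element is
   broadcast and a whole row of B is loaded as one vector, so the four lanes
   of D0 end up holding the four column results.  The epilogue's fld.d loads
   land in the low lanes of U0-U3 (assuming the usual U0 = $xr0, ... mapping
   defined by the including kernel) and xvinsve0.d gathers them into U0 to
   match D0's layout before the GSUB. */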
.macro dgemm_1x4
.L_dgemm_1x4: // See dgemm_kernel_16x4.S
xvldrepl.d U0, A0, 0x00
xvld U4, B0, 0x00
xvfmul.d D0, U0, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x08
PTR_ADDI B0, B0, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M1_L7 */
beq ZERO,TL, .L_dgemm_1x4_M1_L7
xvldrepl.d U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvld U12, B0, 0x00
PTR_ADDI A0, A0, 0x08
PTR_ADDI B0, B0, 0x20
beq ZERO, TL, .L_dgemm_1x4_M1_TL1_END
.align 5
.L_dgemm_1x4_M1_TL1:
KERNEL8x1x4
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_1x4_M1_TL1
.L_dgemm_1x4_M1_TL1_END:
KERNEL8x1x4_END
.L_dgemm_1x4_M1_L7:
/* if (!(L & 7)) goto L_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_1x4_M1_L0
.align 5
.L_dgemm_1x4_M1_L71:
xvldrepl.d U0, A0, 0x00
xvld U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x08
PTR_ADDI B0, B0, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_1x4_M1_L71
.L_dgemm_1x4_M1_L0:
// Load C
fld.d $f0, C0, 0x00
fld.d $f1, C1, 0x00
fld.d $f2, C2, 0x00
fld.d $f3, C3, 0x00
xvinsve0.d U0, U1, 0x01
xvinsve0.d U0, U2, 0x02
xvinsve0.d U0, U3, 0x03
GSUB xvf, d, U0, U0, D0
.endm
.macro dgemm_2x4
.L_dgemm_2x4:
/* Load 2 * 64 from A0 */
xvldrepl.d U0, A0, 0x00
xvldrepl.d U1, A0, 0x08
xvld U4, B0, 0x00
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M2_L7 */
beq ZERO,TL, .L_dgemm_2x4_M2_L7
xvldrepl.d U8, A0, 0x00
xvldrepl.d U9, A0, 0x08
PTR_ADDI TL, TL, -1
xvld U12, B0, 0x00
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x20
beq ZERO, TL, .L_dgemm_2x4_M2_TL1_END
.align 5
.L_dgemm_2x4_M2_TL1:
KERNEL8x2x4
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_2x4_M2_TL1
.L_dgemm_2x4_M2_TL1_END:
KERNEL8x2x4_END
.L_dgemm_2x4_M2_L7:
/* if (!(L & 7)) goto L_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_2x4_M2_L0
.align 5
.L_dgemm_2x4_M2_L71:
xvldrepl.d U0, A0, 0x00
xvldrepl.d U1, A0, 0x08
xvld U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_2x4_M2_L71
.L_dgemm_2x4_M2_L0:
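/* D0/D1 hold rows 0/1 of the 2x4 block with columns in lanes; xvpackev.d /
   xvpackod.d re-interleave them so D4 carries columns 0 and 2 and D5
   columns 1 and 3.  The xvpermi.q pair below packs the two elements loaded
   from C0 together with the two from C2 (and C1 with C3) into the same
   layout before the subtraction. */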
xvpackev.d D4, D1, D0
xvpackod.d D5, D1, D0
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
xvpermi.q U0, U2, 0x02
xvpermi.q U1, U3, 0x02
GSUB xvf, d, U0, U0, D4, U1, U1, D5
.endm
.macro dgemm_4x4
.L_dgemm_4x4:
/* Load 4 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
xvldrepl.d U6, B0, 0x10
/* line 3 */
xvfmul.d D8, U0, U6
xvldrepl.d U7, B0, 0x18
/* line 4 */
xvfmul.d D12, U0, U7
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M4_L7 */
beq ZERO,TL, .L_dgemm_4x4_M4_L7
xvld U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
xvldrepl.d U14, B0, 0x10
xvldrepl.d U15, B0, 0x18
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x20
beq ZERO, TL, .L_dgemm_4x4_M4_TL1_END
.align 5
.L_dgemm_4x4_M4_TL1: /* TL-- */
KERNEL8x4x4
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_4x4_M4_TL1
.L_dgemm_4x4_M4_TL1_END:
KERNEL8x4x4_END
.L_dgemm_4x4_M4_L7:
/* if (!(L & 7)) goto L_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_4x4_M4_L0
.align 5
.L_dgemm_4x4_M4_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U4, B0, 0x08
xvfmadd.d D4, U0, U4, D4
xvldrepl.d U4, B0, 0x10
xvfmadd.d D8, U0, U4, D8
xvldrepl.d U4, B0, 0x18
xvfmadd.d D12, U0, U4, D12
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_4x4_M4_L71
.L_dgemm_4x4_M4_L0:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
/* Load C2 */
xvld U2, C2, 0x00
/* Load C3 */
xvld U3, C3, 0x00
GSUB xvf, d, U0, U0, D0, U1, U1, D4, U2, U2, D8, U3, U3, D12
.endm
.macro dgemm_8x4
.L_dgemm_8x4:
/* Load 8 * 64 from A0 */
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
xvfmul.d D5, U1, U5
xvldrepl.d U6, B0, 0x10
/* line 3 */
xvfmul.d D8, U0, U6
xvfmul.d D9, U1, U6
xvldrepl.d U7, B0, 0x18
/* line 4 */
xvfmul.d D12, U0, U7
xvfmul.d D13, U1, U7
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_M8_L7 */
beq ZERO,TL, .L_dgemm_8x4_M8_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
xvldrepl.d U14, B0, 0x10
xvldrepl.d U15, B0, 0x18
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
beq ZERO, TL, .L_dgemm_8x4_M8_TL1_END
.align 5
.L_dgemm_8x4_M8_TL1: /* TL-- */
KERNEL8x8x4
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_8x4_M8_TL1
.L_dgemm_8x4_M8_TL1_END:
KERNEL8x8x4_END
.L_dgemm_8x4_M8_L7:
/* if (!(L & 7)) goto L_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_8x4_M8_L0
.align 5
.L_dgemm_8x4_M8_L71:
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvldrepl.d U6, B0, 0x10
xvfmadd.d D8, U0, U6, D8
xvfmadd.d D9, U1, U6, D9
xvldrepl.d U7, B0, 0x18
xvfmadd.d D12, U0, U7, D12
xvfmadd.d D13, U1, U7, D13
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x20
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_8x4_M8_L71
.L_dgemm_8x4_M8_L0:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
/* Load C2 */
xvld U4, C2, 0x00
xvld U5, C2, 0x20
/* Load C3 */
xvld U6, C3, 0x00
xvld U7, C3, 0x20
GSUB xvf, d, U0, U0, D0, U1, U1, D1, \
U2, U2, D4, U3, U3, D5, \
U4, U4, D8, U5, U5, D9, \
U6, U6, D12, U7, U7, D13
.endm
.macro dgemm_4x2
.L_dgemm_4x2:
/* Load 4 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_4x2_N3_M4_L7 */
beq ZERO,TL, .L_dgemm_4x2_N3_M4_L7
xvld U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x10
beq ZERO, TL, .L_dgemm_4x2_N3_M4_TL1_END
.align 5
.L_dgemm_4x2_N3_M4_TL1: /* TL-- */
KERNEL8x4x2
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_4x2_N3_M4_TL1
.L_dgemm_4x2_N3_M4_TL1_END:
KERNEL8x4x2_END
.L_dgemm_4x2_N3_M4_L7:
/* if (!(L & 7)) goto L_dgemm_4x2_N3_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_4x2_N3_M4_L0
.align 5
.L_dgemm_4x2_N3_M4_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_4x2_N3_M4_L71
.L_dgemm_4x2_N3_M4_L0:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
GSUB xvf, d, U0, U0, D0, U1, U1, D4
.endm
.macro dgemm_2x2
.L_dgemm_2x2:
/* Load 2 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvldrepl.d U4, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_2x2_N3_M2_L7 */
beq ZERO,TL, .L_dgemm_2x2_N3_M2_L7
xvld U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x10
beq ZERO, TL, .L_dgemm_2x2_N3_M2_TL1_END
.align 5
.L_dgemm_2x2_N3_M2_TL1: /* TL-- */
KERNEL8x2x2
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_2x2_N3_M2_TL1
.L_dgemm_2x2_N3_M2_TL1_END:
KERNEL8x2x2_END
.L_dgemm_2x2_N3_M2_L7:
/* if (!(L & 7)) goto L_dgemm_2x2_N3_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_2x2_N3_M2_L0
.align 5
.L_dgemm_2x2_N3_M2_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_2x2_N3_M2_L71
.L_dgemm_2x2_N3_M2_L0:
/* Load C0 */
xvld U0, C0, 0x00
/* Load C1 */
xvld U1, C1, 0x00
GSUB xvf, d, U0, U0, D0, U1, U1, D4
.endm
.macro dgemm_8x2
.L_dgemm_8x2:
/* Load 8 * 64 from A0 */
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
xvfmul.d D5, U1, U5
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_8x2_N3_M8_L7 */
beq ZERO,TL, .L_dgemm_8x2_N3_M8_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
beq ZERO, TL, .L_dgemm_8x2_N3_M8_TL1_END
.align 5
.L_dgemm_8x2_N3_M8_TL1: /* TL-- */
KERNEL8x8x2
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_8x2_N3_M8_TL1
.L_dgemm_8x2_N3_M8_TL1_END:
KERNEL8x8x2_END
.L_dgemm_8x2_N3_M8_L7:
/* if (!(L & 7)) goto L_dgemm_8x2_N3_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_8x2_N3_M8_L0
.align 5
.L_dgemm_8x2_N3_M8_L71:
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_8x2_N3_M8_L71
.L_dgemm_8x2_N3_M8_L0:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
/* Load C1 */
xvld U2, C1, 0x00
xvld U3, C1, 0x20
GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D4, U3, U3, D5
.endm
.macro dgemm_16x2
.L_dgemm_16x2:
/* Load 16 * 64 from A0
* U0 = {a3, a2, a1, a0}
* U1 = {a7, a6, a5, a4}
* U2 = {a11, a10, a9, a8}
* U3 = {a15, a14, a13, a12}
*/
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
xvfmul.d D2, U2, U4
xvfmul.d D3, U3, U4
xvldrepl.d U5, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U5
xvfmul.d D5, U1, U5
xvfmul.d D6, U2, U5
xvfmul.d D7, U3, U5
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x10
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N3_L7 */
beq ZERO,TL, .L_dgemm_16x2_N3_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
xvld U10, A0, 0x40
xvld U11, A0, 0x60
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x10
beq ZERO, TL, .L_dgemm_16x2_N3_TL1_END
.align 5
.L_dgemm_16x2_N3_TL1: /* TL-- */
KERNEL8x16x2
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_16x2_N3_TL1
.L_dgemm_16x2_N3_TL1_END:
KERNEL8x16x2_END
.L_dgemm_16x2_N3_L7:
/* if (!(L & 7)) goto L_dgemm_16x2_N3_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_16x2_N3_L0
.align 5
.L_dgemm_16x2_N3_L71:
/* Load 16 * 64 from A0 */
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
xvfmadd.d D5, U1, U5, D5
xvfmadd.d D6, U2, U5, D6
xvfmadd.d D7, U3, U5, D7
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x10
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_16x2_N3_L71
.L_dgemm_16x2_N3_L0:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
/* Load C1 */
xvld U4, C1, 0x00
xvld U5, C1, 0x20
xvld U6, C1, 0x40
xvld U7, C1, 0x60
GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3, \
U4, U4, D4, U5, U5, D5, U6, U6, D6, U7, U7, D7
.endm
.macro dgemm_2x1
.L_dgemm_2x1:
/* Load 2 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_2x1_N1_M2_L7 */
beq ZERO,TL, .L_dgemm_2x1_N1_M2_L7
xvld U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x08
beq ZERO, TL, .L_dgemm_2x1_N1_M2_TL1_END
.align 5
.L_dgemm_2x1_N1_M2_TL1: /* TL-- */
KERNEL8x2x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_2x1_N1_M2_TL1
.L_dgemm_2x1_N1_M2_TL1_END:
KERNEL8x2x1_END
.L_dgemm_2x1_N1_M2_L7:
/* if (!(L & 7)) goto L_dgemm_2x1_N1_M2_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_2x1_N1_M2_L0
.align 5
.L_dgemm_2x1_N1_M2_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x10
PTR_ADDI B0, B0, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_2x1_N1_M2_L71
.L_dgemm_2x1_N1_M2_L0:
/* Load C0 */
xvld U0, C0, 0x00
GSUB xvf, d, U0, U0, D0
.endm
.macro dgemm_4x1
.L_dgemm_4x1:
/* Load 4 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_4x1_N1_M4_L7 */
beq ZERO,TL, .L_dgemm_4x1_N1_M4_L7
xvld U8, A0, 0x00
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x08
beq ZERO, TL, .L_dgemm_4x1_N1_M4_TL1_END
.align 5
.L_dgemm_4x1_N1_M4_TL1: /* TL-- */
KERNEL8x4x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_4x1_N1_M4_TL1
.L_dgemm_4x1_N1_M4_TL1_END:
KERNEL8x4x1_END
.L_dgemm_4x1_N1_M4_L7:
/* if (!(L & 7)) goto L_dgemm_4x1_N1_M4_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_4x1_N1_M4_L0
.align 5
.L_dgemm_4x1_N1_M4_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x20
PTR_ADDI B0, B0, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_4x1_N1_M4_L71
.L_dgemm_4x1_N1_M4_L0:
/* Load C0 */
xvld U0, C0, 0x00
GSUB xvf, d, U0, U0, D0
.endm
.macro dgemm_8x1
.L_dgemm_8x1:
/* Load 8 * 64 from A0 */
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_8x1_N1_M8_L7 */
beq ZERO,TL, .L_dgemm_8x1_N1_M8_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
beq ZERO, TL, .L_dgemm_8x1_N1_M8_TL1_END
.align 5
.L_dgemm_8x1_N1_M8_TL1: /* TL-- */
KERNEL8x8x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_8x1_N1_M8_TL1
.L_dgemm_8x1_N1_M8_TL1_END:
KERNEL8x8x1_END
.L_dgemm_8x1_N1_M8_L7:
/* if (!(L & 7)) goto L_dgemm_8x1_N1_M8_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_8x1_N1_M8_L0
.align 5
.L_dgemm_8x1_N1_M8_L71:
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x40
PTR_ADDI B0, B0, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_8x1_N1_M8_L71
.L_dgemm_8x1_N1_M8_L0:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
GSUB xvf, d, U0, U0, D0, U1, U1, D1
.endm
.macro dgemm_16x1
.L_dgemm_16x1:
/* Load 16 * 64 from A0
* U0 = {a3, a2, a1, a0}
* U1 = {a7, a6, a5, a4}
* U2 = {a11, a10, a9, a8}
* U3 = {a15, a14, a13, a12}
*/
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvfmul.d D1, U1, U4
xvfmul.d D2, U2, U4
xvfmul.d D3, U3, U4
/* Add stride for A0 and B0 */
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x08
/* Reduce L */
PTR_ADDI L, L, -1
PTR_SRAI TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_dgemm_16x1_N1_L7 */
beq ZERO,TL, .L_dgemm_16x1_N1_L7
xvld U8, A0, 0x00
xvld U9, A0, 0x20
xvld U10, A0, 0x40
xvld U11, A0, 0x60
PTR_ADDI TL, TL, -1
xvldrepl.d U12, B0, 0x00
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x08
beq ZERO, TL, .L_dgemm_16x1_N1_TL1_END
.align 5
.L_dgemm_16x1_N1_TL1: /* TL-- */
KERNEL8x16x1
PTR_ADDI TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_16x1_N1_TL1
.L_dgemm_16x1_N1_TL1_END:
KERNEL8x16x1_END
.L_dgemm_16x1_N1_L7:
/* if (!(L & 7)) goto L_dgemm_16x1_N1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_16x1_N1_L0
.align 5
.L_dgemm_16x1_N1_L71:
/* Load 16 * 64 from A0 */
xvld U0, A0, 0x00
xvld U1, A0, 0x20
xvld U2, A0, 0x40
xvld U3, A0, 0x60
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvfmadd.d D1, U1, U4, D1
xvfmadd.d D2, U2, U4, D2
xvfmadd.d D3, U3, U4, D3
/* Add stride for A0, B0 */
PTR_ADDI A0, A0, 0x80
PTR_ADDI B0, B0, 0x08
PTR_ADDI TL, TL, -1
blt ZERO,TL, .L_dgemm_16x1_N1_L71
.L_dgemm_16x1_N1_L0:
/* Load C0 */
xvld U0, C0, 0x00
xvld U1, C0, 0x20
xvld U2, C0, 0x40
xvld U3, C0, 0x60
GSUB xvf, d, U0, U0, D0, U1, U1, D1, U2, U2, D2, U3, U3, D3
.endm
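/* dgemm_1x2: one row of A against two columns of B; only lane 0 of D0 and
   of D4 is meaningful.  The epilogue uses xvinsve0.d to place the C1 and D4
   values into lane 1 of U0 and D0 respectively, so one GSUB covers both
   columns. */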
.macro dgemm_1x2
.L_dgemm_1x2: // See dgemm_kernel_16x4.S
/* Load 1 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
xvldrepl.d U4, B0, 0x08
/* line 2 */
xvfmul.d D4, U0, U4
/* Add stride for A0 and B0 */
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
/* Reduce L */
addi.d L, L, -1
srai.d TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N3_M1_L7 */
beq ZERO,TL, .L_dgemm_1x2_N3_M1_L7
xvld U8, A0, 0x00
addi.d TL, TL, -1
xvldrepl.d U12, B0, 0x00
xvldrepl.d U13, B0, 0x08
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
beq ZERO, TL, .L_dgemm_1x2_N3_M1_TL1_END
.L_dgemm_1x2_N3_M1_TL1: /* TL-- */
KERNEL8x1x2
addi.d TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_1x2_N3_M1_TL1
.L_dgemm_1x2_N3_M1_TL1_END:
KERNEL8x1x2_END
.L_dgemm_1x2_N3_M1_L7:
/* if (!(L & 7)) goto L_dgemm_1x2_N3_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_1x2_N3_M1_L0
.L_dgemm_1x2_N3_M1_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
xvldrepl.d U5, B0, 0x08
xvfmadd.d D4, U0, U5, D4
/* Add stride for A0, B0 */
addi.d A0, A0, 0x08
addi.d B0, B0, 0x10
addi.d TL, TL, -1
blt ZERO,TL, .L_dgemm_1x2_N3_M1_L71
.L_dgemm_1x2_N3_M1_L0:
xvld U0, C0, 0x00
xvld U1, C1, 0x00
xvinsve0.d U0, U1, 0x01
xvinsve0.d D0, D4, 0x01
GSUB xvf, d, U0, U0, D0
.endm
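/* dgemm_1x1: scalar case.  The full-width xvld/xvfmadd.d touch four lanes,
   but only lane 0 (the single C element) is significant; the upper lanes
   are presumably ignored by the including kernel. */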
.macro dgemm_1x1
.L_dgemm_1x1:
/* Load 1 * 64 from A0 */
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
/* line 1 */
xvfmul.d D0, U0, U4
/* Add stride for A0 and B0 */
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
/* Reduce L */
addi.d L, L, -1
srai.d TL, L, 3 /* TL = (L-1) >> 3 */
/* if (TL < 1) goto L_N1_M1_L7 */
beq ZERO,TL, .L_dgemm_1x1_N1_M1_L7
xvld U8, A0, 0x00
addi.d TL, TL, -1
xvldrepl.d U12, B0, 0x00
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
beq ZERO, TL, .L_dgemm_1x1_N1_M1_TL1_END
.L_dgemm_1x1_N1_M1_TL1: /* TL-- */
KERNEL8x1x1
addi.d TL, TL, -1 /* TL-- */
blt ZERO,TL, .L_dgemm_1x1_N1_M1_TL1
.L_dgemm_1x1_N1_M1_TL1_END:
KERNEL8x1x1_END
.L_dgemm_1x1_N1_M1_L7:
/* if (!(L & 7)) goto L_N1_M1_L0 */
andi TL, L, 7
beq TL, ZERO,.L_dgemm_1x1_N1_M1_L0
.L_dgemm_1x1_N1_M1_L71:
xvld U0, A0, 0x00
xvldrepl.d U4, B0, 0x00
xvfmadd.d D0, U0, U4, D0
/* Add stride for A0, B0 */
addi.d A0, A0, 0x08
addi.d B0, B0, 0x08
addi.d TL, TL, -1
blt ZERO,TL, .L_dgemm_1x1_N1_M1_L71
.L_dgemm_1x1_N1_M1_L0:
/* Load C0 */
xvld U0, C0, 0x00
GSUB xvf, d, U0, U0, D0
.endm