/***************************************************************************
Copyright (c) 2013-2019, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
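
/* unit_size below is the element size in bytes (4, single precision), so
   DISPn(ind,disp) = ind*4*n + disp gives the byte offset of unrolled
   iteration `ind` when n floats are consumed per step, plus a fixed
   displacement; e.g. DISP32(1,16) = 1*4*32 + 16 = 144.  The kernel macros
   in this file use these offsets to address the A (AO) and B (BO) panels
   while software-pipelining their loads. */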

#define unit_size 4
#define DISP64(ind,disp) (ind*unit_size*64+disp)
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)

/**********************************************************************************************
* Macros for N=8 and M=16
**********************************************************************************************/

.macro KERNEL8x16_L1_L4 Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro KERNEL8x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
.endm

.macro KERNEL8x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
KERNEL8x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
.endm

.macro Zero8X16
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
xxlxor vs38, vs38, vs38
|
|
xxlxor vs39, vs39, vs39
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
xxlxor vs42, vs42, vs42
|
|
xxlxor vs43, vs43, vs43
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
xxlxor vs46, vs46, vs46
|
|
xxlxor vs47, vs47, vs47
|
|
xxlxor vs48, vs48, vs48
|
|
xxlxor vs49, vs49, vs49
|
|
xxlxor vs50, vs50, vs50
|
|
xxlxor vs51, vs51, vs51
|
|
xxlxor vs52, vs52, vs52
|
|
xxlxor vs53, vs53, vs53
|
|
xxlxor vs54, vs54, vs54
|
|
xxlxor vs55, vs55, vs55
|
|
xxlxor vs56, vs56, vs56
|
|
xxlxor vs57, vs57, vs57
|
|
xxlxor vs58, vs58, vs58
|
|
xxlxor vs59, vs59, vs59
|
|
xxlxor vs60, vs60, vs60
|
|
xxlxor vs61, vs61, vs61
|
|
xxlxor vs62, vs62, vs62
|
|
xxlxor vs63, vs63, vs63
|
|
.endm
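
/* LOAD8x16 (below) preloads one k-step: two B vectors (8 floats) into
   vs24/vs28 and four A vectors (16 floats) into vs0..vs3, then builds
   swapped copies of B (vs25..vs27, vs29..vs31) with xxperm/xxpermdi using
   permute_mask (set up elsewhere in this kernel), so the FMAs can form the
   "butterfly" ordering that SAVE8x16 later undoes. */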
|
|
|
|
.macro LOAD8x16 OffsetA,OffsetB
|
|
|
|
lxv vs24, (\OffsetB+0)(BO)
|
|
lxv vs28, (\OffsetB+16)(BO)
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
lxv vs0, (\OffsetA+0)(AO)
|
|
lxv vs1, (\OffsetA+16)(AO)
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
lxv vs2, (\OffsetA+32)(AO)
|
|
lxv vs3, (\OffsetA+48)(AO)
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
|
|
.endm
|
|
|
|
.macro END8x16_NORMAL
|
|
END8x16 0, AO, BO, 64,32
|
|
.endm
|
|
|
|
.macro END8x16_WITHOUT_ADD
|
|
END8x16 0, AO,BO,0,0
|
|
.endm
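
/* END8x16 (below) finishes one k-step for the 8x16 tile: it first advances
   BO/AO by OffsetB/OffsetA (when non-zero), then performs the rank-1 update
   of the 32 accumulators vs32..vs63; with First==1 the accumulators are
   initialized with xvmulsp instead of accumulated with xvmaddasp. */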
|
|
|
|
.macro END8x16 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
xvmulsp vs34, vs2,vs24
|
|
xvmulsp vs35, vs3,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
xvmulsp vs38, vs2,vs25
|
|
xvmulsp vs39, vs3,vs25
|
|
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
xvmulsp vs42, vs2,vs26
|
|
xvmulsp vs43, vs3,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
xvmulsp vs46, vs2,vs27
|
|
xvmulsp vs47, vs3,vs27
|
|
|
|
xvmulsp vs48, vs0,vs28
|
|
xvmulsp vs49, vs1,vs28
|
|
xvmulsp vs50, vs2,vs28
|
|
xvmulsp vs51, vs3,vs28
|
|
|
|
xvmulsp vs52, vs0,vs29
|
|
xvmulsp vs53, vs1,vs29
|
|
xvmulsp vs54, vs2,vs29
|
|
xvmulsp vs55, vs3,vs29
|
|
|
|
xvmulsp vs56, vs0,vs30
|
|
xvmulsp vs57, vs1,vs30
|
|
xvmulsp vs58, vs2,vs30
|
|
xvmulsp vs59, vs3,vs30
|
|
|
|
xvmulsp vs60, vs0,vs31
|
|
xvmulsp vs61, vs1,vs31
|
|
xvmulsp vs62, vs2,vs31
|
|
xvmulsp vs63, vs3,vs31
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
xvmaddasp vs50, vs2,vs28
|
|
xvmaddasp vs51, vs3,vs28
|
|
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
xvmaddasp vs54, vs2,vs29
|
|
xvmaddasp vs55, vs3,vs29
|
|
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
xvmaddasp vs58, vs2,vs30
|
|
xvmaddasp vs59, vs3,vs30
|
|
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
xvmaddasp vs62, vs2,vs31
|
|
xvmaddasp vs63, vs3,vs31
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro KERNEL8x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
KERNEL8x16_2 \AREG,\BREG, \OffsetA,\OffsetB, (\Index*2),0 ,0
|
|
KERNEL8x16_2 \AREG,\BREG,\OffsetA,\OffsetB, (\Index*2+1),\IsLast ,\Complete
|
|
|
|
.endm
|
|
|
|
.macro KERNEL8x16 First
|
|
|
|
LOAD8x16 0,0
|
|
END8x16 \First, AO, BO, 64,32
|
|
.endm
|
|
|
|
.macro LOAD8x16_2
|
|
LOAD8x16_2O AO,BO, 0,0
|
|
.endm
|
|
|
|
.macro LOAD8x16_2O AREG,BREG, OffsetA,OffsetB
|
|
lxv vs8, (\OffsetB)(\BREG)
|
|
lxv vs12, (16+\OffsetB)(\BREG)
|
|
lxv vs24, (32+\OffsetB)(\BREG)
|
|
lxv vs28, (32+16+\OffsetB)(\BREG)
|
|
lxv vs4, (0+\OffsetA)(\AREG)
|
|
lxv vs5, (16+\OffsetA)(\AREG)
|
|
xxperm vs10, vs8, permute_mask
|
|
xxperm vs14, vs12, permute_mask
|
|
lxv vs6, (32+\OffsetA)(\AREG)
|
|
lxv vs7, (48+\OffsetA)(\AREG)
|
|
xxpermdi vs9, vs8, vs8,2
|
|
xxpermdi vs13, vs12, vs12,2
|
|
lxv vs0, (64+\OffsetA)(\AREG)
|
|
lxv vs1, (64+16+\OffsetA)(\AREG)
|
|
xxpermdi vs11, vs10, vs10,2
|
|
xxpermdi vs15, vs14, vs14,2
|
|
lxv vs2, (64+32+\OffsetA)(\AREG)
|
|
lxv vs3, (64+48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
.endm
|
|
|
|
.macro END8x16_2
|
|
/* after LOAD8x16_2 the remaining A/B offsets to consume are 128 and 64 */
|
|
KERNEL8x16_2 AO,BO, 128,64,0 ,1,1
|
|
.endm
|
|
|
|
|
|
|
|
.macro KERNEL8x16_E2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
|
|
.endm
|
|
|
|
|
|
.macro KERNEL8x16_L2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x16_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
|
|
.endm
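
/* KERNEL8x16_2 (below) is the two-step unrolled body: it first uses the
   previously loaded vs4..vs7 / vs8..vs15, then vs0..vs3 / vs24..vs31,
   interleaving the loads for the next call between the FMAs.  Complete==1
   suppresses those trailing loads; IsLast==1 advances AREG/BREG past the
   consumed data (DISP32(Index,128) for A and DISP16(Index,64) for B when
   the trailing loads were issued). */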
|
|
|
|
|
|
.macro KERNEL8x16_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
xvmaddasp vs48, vs4,vs12
|
|
xvmaddasp vs49, vs5,vs12
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
xvmaddasp vs56, vs4,vs14
|
|
xvmaddasp vs57, vs5,vs14
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
xvmaddasp vs52, vs4,vs13
|
|
xvmaddasp vs53, vs5,vs13
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
xvmaddasp vs60, vs4,vs15
|
|
xvmaddasp vs61, vs5,vs15
|
|
|
|
.if \Complete==0
|
|
lxv vs4, DISP32(\Index,0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
.endif
|
|
|
|
xvmaddasp vs34, vs6,vs8
|
|
xvmaddasp vs35, vs7,vs8
|
|
xvmaddasp vs50, vs6,vs12
|
|
xvmaddasp vs51, vs7,vs12
|
|
.if \Complete==0
|
|
lxv vs8, DISP16(\Index,\OffsetB)(\BREG)
|
|
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
.endif
|
|
xvmaddasp vs42, vs6,vs10
|
|
xvmaddasp vs43, vs7,vs10
|
|
xvmaddasp vs58, vs6,vs14
|
|
xvmaddasp vs59, vs7,vs14
|
|
.if \Complete==0
|
|
xxperm vs10, vs8, permute_mask
|
|
xxperm vs14, vs12, permute_mask
|
|
.endif
|
|
xvmaddasp vs38, vs6,vs9
|
|
xvmaddasp vs39, vs7,vs9
|
|
xvmaddasp vs54, vs6,vs13
|
|
xvmaddasp vs55, vs7,vs13
|
|
.if \Complete==0
|
|
xxpermdi vs9, vs8, vs8,2
|
|
xxpermdi vs13, vs12, vs12,2
|
|
.endif
|
|
xvmaddasp vs46, vs6,vs11
|
|
xvmaddasp vs47, vs7,vs11
|
|
xvmaddasp vs62, vs6,vs15
|
|
xvmaddasp vs63, vs7,vs15
|
|
.if \Complete==0
|
|
xxpermdi vs11, vs10, vs10,2
|
|
xxpermdi vs15, vs14, vs14,2
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
|
|
.endif
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
.if \Complete==0
|
|
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
|
.endif
|
|
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
xvmaddasp vs50, vs2,vs28
|
|
xvmaddasp vs51, vs3,vs28
|
|
.if \Complete==0
|
|
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
|
|
.endif
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
xvmaddasp vs58, vs2,vs30
|
|
xvmaddasp vs59, vs3,vs30
|
|
.if \Complete==0
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
.endif
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
xvmaddasp vs54, vs2,vs29
|
|
xvmaddasp vs55, vs3,vs29
|
|
.if \Complete==0
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
.endif
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
xvmaddasp vs62, vs2,vs31
|
|
xvmaddasp vs63, vs3,vs31
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
.endif
|
|
.if \Complete==0
|
|
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
|
|
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
|
|
.endif
|
|
|
|
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \BREG, \BREG, DISP16(\Index,\OffsetB)
|
|
addi \AREG, \AREG, DISP32(\Index,\OffsetA)
|
|
|
|
.else
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
|
|
.endif
|
|
.endif
|
|
|
|
|
|
.endm
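
/* SAVE8x16 (below) stores the 16x8 result tile: T1..T7 are the remaining
   column pointers (CO + k*LDC, with T10 = 2*LDC), the xxmrglw/xxmrghw/xxperm
   sequence transposes the butterfly-ordered accumulators vs32..vs63 back to
   memory order using save_permute_1/save_permute_2 (set up elsewhere in this
   kernel), and the results are scaled by alpha_r (multiplied for TRMMKERNEL,
   otherwise multiply-added onto the C values loaded from memory) before
   being stored. */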
|
|
|
|
|
|
.macro SAVE8x16
|
|
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
|
|
add T4, T2, T10
|
|
add T5, T3, T10
|
|
|
|
add T6, T4, T10
|
|
add T7, T5, T10
|
|
|
|
|
|
|
|
/* permute to restore the butterfly rank-1 update to the normal promoted layout */
/* permute 16 vs8 MEM(CO) vs9 MEM(CO+LDC) vs10 MEM(CO+2*LDC) vs11 MEM(CO+3*LDC) */
/* permute 16 vs12 MEM(16+CO) vs13 MEM(16+CO+LDC) vs14 MEM(16+CO+2*LDC) vs15 MEM(16+CO+3*LDC) */
/* permute 16 vs16 MEM(32+CO) vs17 MEM(32+CO+LDC) vs18 MEM(32+CO+2*LDC) vs19 MEM(32+CO+3*LDC) */
/* permute 16 vs24 MEM(48+CO) vs25 MEM(48+CO+LDC) vs26 MEM(48+CO+2*LDC) vs27 MEM(48+CO+3*LDC) */
|
|
|
|
xxmrglw vs8, vs32, vs44
|
|
xxmrglw vs10, vs36, vs40
|
|
|
|
xxmrghw vs1, vs32, vs44
|
|
xxmrghw vs0, vs36, vs40
|
|
|
|
xxmrglw vs12, vs33, vs45
|
|
xxmrglw vs14, vs37, vs41
|
|
|
|
xxmrghw vs2, vs37, vs41
|
|
xxmrghw vs3, vs33, vs45
|
|
#ifndef TRMMKERNEL
|
|
lxv vs32, 0(CO)
|
|
lxv vs33, 16(CO)
|
|
#endif
|
|
xxmrglw vs16, vs34, vs46
|
|
xxmrglw vs18, vs38, vs42
|
|
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
|
|
xxmrghw vs4, vs38, vs42
|
|
xxmrghw vs5, vs34, vs46
|
|
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxmrglw vs24, vs35, vs47
|
|
xxmrglw vs26, vs39, vs43
|
|
|
|
xxlor vs17, vs16, vs16
|
|
xxlor vs19, vs18, vs18
|
|
|
|
xxmrghw vs30, vs39, vs43
|
|
xxmrghw vs31, vs35, vs47
|
|
#ifndef TRMMKERNEL
|
|
lxv vs34, 32(CO)
|
|
lxv vs35, 48(CO)
|
|
#endif
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
#ifndef TRMMKERNEL
|
|
lxv vs36, 0(T1)
|
|
lxv vs37, 16(T1)
|
|
#endif
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs38, 32(T1)
|
|
lxv vs39, 48(T1)
|
|
#endif
|
|
|
|
xxlor vs25, vs24, vs24
|
|
xxlor vs27, vs26, vs26
|
|
|
|
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs40, 0(T2)
|
|
lxv vs41, 16(T2)
|
|
#endif
|
|
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
#ifndef TRMMKERNEL
|
|
lxv vs42, 32(T2)
|
|
lxv vs43, 48(T2)
|
|
#endif
|
|
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
#ifndef TRMMKERNEL
|
|
lxv vs44, 0(T3)
|
|
lxv vs45, 16(T3)
|
|
#endif
|
|
xxperm vs16, vs4, save_permute_1
|
|
xxperm vs18, vs5, save_permute_1
|
|
#ifndef TRMMKERNEL
|
|
lxv vs46, 32(T3)
|
|
lxv vs47, 48(T3)
|
|
#endif
|
|
|
|
|
|
|
|
|
|
|
|
xxperm vs17, vs4, save_permute_2
|
|
xxperm vs19, vs5, save_permute_2
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs32, vs8, alpha_r
|
|
xvmulsp vs33, vs12, alpha_r
|
|
#else
|
|
xvmaddasp vs32, vs8, alpha_r
|
|
xvmaddasp vs33, vs12, alpha_r
|
|
#endif
|
|
xxperm vs24, vs30, save_permute_1
|
|
xxperm vs26, vs31, save_permute_1
|
|
|
|
|
|
stxv vs32, 0(CO)
|
|
stxv vs33, 16(CO)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs34, vs16, alpha_r
|
|
xvmulsp vs35, vs24, alpha_r
|
|
#else
|
|
xvmaddasp vs34, vs16, alpha_r
|
|
xvmaddasp vs35, vs24, alpha_r
|
|
#endif
|
|
|
|
xxperm vs25, vs30, save_permute_2
|
|
xxperm vs27, vs31, save_permute_2
|
|
|
|
|
|
stxv vs34, 32(CO)
|
|
stxv vs35, 48(CO)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs36, vs9, alpha_r
|
|
xvmulsp vs37, vs13, alpha_r
|
|
#else
|
|
xvmaddasp vs36, vs9, alpha_r
|
|
xvmaddasp vs37, vs13, alpha_r
|
|
#endif
|
|
stxv vs36, 0(T1)
|
|
stxv vs37, 16(T1)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs38, vs17, alpha_r
|
|
xvmulsp vs39, vs25, alpha_r
|
|
#else
|
|
xvmaddasp vs38, vs17, alpha_r
|
|
xvmaddasp vs39, vs25, alpha_r
|
|
#endif
|
|
stxv vs38, 32(T1)
|
|
stxv vs39, 48(T1)
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs40, vs10, alpha_r
|
|
xvmulsp vs41, vs14, alpha_r
|
|
#else
|
|
xvmaddasp vs40, vs10, alpha_r
|
|
xvmaddasp vs41, vs14, alpha_r
|
|
#endif
|
|
|
|
stxv vs40, 0(T2)
|
|
stxv vs41, 16(T2)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs42, vs18, alpha_r
|
|
xvmulsp vs43, vs26, alpha_r
|
|
#else
|
|
xvmaddasp vs42, vs18, alpha_r
|
|
xvmaddasp vs43, vs26, alpha_r
|
|
#endif
|
|
stxv vs42, 32(T2)
|
|
stxv vs43, 48(T2)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs44, vs11, alpha_r
|
|
xvmulsp vs45, vs15, alpha_r
|
|
#else
|
|
xvmaddasp vs44, vs11, alpha_r
|
|
xvmaddasp vs45, vs15, alpha_r
|
|
#endif
|
|
stxv vs44, 0(T3)
|
|
stxv vs45, 16(T3)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs46, vs19, alpha_r
|
|
xvmulsp vs47, vs27, alpha_r
|
|
#else
|
|
xvmaddasp vs46, vs19, alpha_r
|
|
xvmaddasp vs47, vs27, alpha_r
|
|
#endif
|
|
stxv vs46, 32(T3)
|
|
stxv vs47, 48(T3)
|
|
|
|
/***** the same sequence for the second half (vs48..vs63, stored to T4..T7) *****/
|
|
#ifndef TRMMKERNEL
|
|
lxv vs32, 0(T4)
|
|
lxv vs33, 16(T4)
|
|
#endif
|
|
xxmrglw vs8, vs48, vs60
|
|
xxmrglw vs10, vs52, vs56
|
|
#ifndef TRMMKERNEL
|
|
lxv vs34, 32(T4)
|
|
lxv vs35, 48(T4)
|
|
#endif
|
|
xxmrghw vs1, vs48, vs60
|
|
xxmrghw vs0, vs52, vs56
|
|
#ifndef TRMMKERNEL
|
|
lxv vs36, 0(T5)
|
|
lxv vs37, 16(T5)
|
|
#endif
|
|
xxmrglw vs12, vs49, vs61
|
|
xxmrglw vs14, vs53, vs57
|
|
#ifndef TRMMKERNEL
|
|
lxv vs38,32(T5)
|
|
lxv vs39, 48(T5)
|
|
#endif
|
|
|
|
xxmrghw vs2, vs53, vs57
|
|
xxmrghw vs3, vs49, vs61
|
|
#ifndef TRMMKERNEL
|
|
lxv vs40, 0(T6)
|
|
lxv vs41, 16(T6)
|
|
#endif
|
|
xxmrglw vs16, vs50, vs62
|
|
xxmrglw vs18, vs54, vs58
|
|
#ifndef TRMMKERNEL
|
|
lxv vs42, 32(T6)
|
|
lxv vs43, 48(T6)
|
|
#endif
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
xxmrghw vs4, vs54, vs58
|
|
xxmrghw vs5, vs50, vs62
|
|
#ifndef TRMMKERNEL
|
|
lxv vs44, 0(T7)
|
|
lxv vs45, 16(T7)
|
|
#endif
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxmrglw vs24, vs51, vs63
|
|
xxmrglw vs26, vs55, vs59
|
|
#ifndef TRMMKERNEL
|
|
lxv vs46, 32(T7)
|
|
lxv vs47, 48(T7)
|
|
#endif
|
|
xxlor vs17, vs16, vs16
|
|
xxlor vs19, vs18, vs18
|
|
xxmrghw vs30, vs55, vs59
|
|
xxmrghw vs31, vs51, vs63
|
|
|
|
|
|
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
xxlor vs25, vs24, vs24
|
|
xxlor vs27, vs26, vs26
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs32, vs8, alpha_r
|
|
xvmulsp vs33, vs12, alpha_r
|
|
#else
|
|
xvmaddasp vs32, vs8, alpha_r
|
|
xvmaddasp vs33, vs12, alpha_r
|
|
#endif
|
|
xxperm vs16, vs4, save_permute_1
|
|
xxperm vs18, vs5, save_permute_1
|
|
stxv vs32, 0(T4)
|
|
stxv vs33, 16(T4)
|
|
xxperm vs17, vs4, save_permute_2
|
|
xxperm vs19, vs5, save_permute_2
|
|
xxperm vs24, vs30, save_permute_1
|
|
xxperm vs26, vs31, save_permute_1
|
|
xxperm vs25, vs30, save_permute_2
|
|
xxperm vs27, vs31, save_permute_2
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs34, vs16, alpha_r
|
|
xvmulsp vs35, vs24, alpha_r
|
|
#else
|
|
xvmaddasp vs34, vs16, alpha_r
|
|
xvmaddasp vs35, vs24, alpha_r
|
|
#endif
|
|
stxv vs34, 32(T4)
|
|
stxv vs35, 48(T4)
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs36, vs9, alpha_r
|
|
xvmulsp vs37, vs13, alpha_r
|
|
#else
|
|
xvmaddasp vs36, vs9, alpha_r
|
|
xvmaddasp vs37, vs13, alpha_r
|
|
#endif
|
|
stxv vs36, 0(T5)
|
|
stxv vs37, 16(T5)
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs38, vs17, alpha_r
|
|
xvmulsp vs39, vs25, alpha_r
|
|
#else
|
|
xvmaddasp vs38, vs17, alpha_r
|
|
xvmaddasp vs39, vs25, alpha_r
|
|
#endif
|
|
|
|
|
|
|
|
|
|
stxv vs38, 32(T5)
|
|
stxv vs39, 48(T5)
|
|
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs40, vs10, alpha_r
|
|
xvmulsp vs41, vs14, alpha_r
|
|
#else
|
|
xvmaddasp vs40, vs10, alpha_r
|
|
xvmaddasp vs41, vs14, alpha_r
|
|
#endif
|
|
stxv vs40, 0(T6)
|
|
stxv vs41, 16(T6)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs42, vs18, alpha_r
|
|
xvmulsp vs43, vs26, alpha_r
|
|
#else
|
|
xvmaddasp vs42, vs18, alpha_r
|
|
xvmaddasp vs43, vs26, alpha_r
|
|
#endif
|
|
stxv vs42, 32(T6)
|
|
stxv vs43, 48(T6)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs44, vs11, alpha_r
|
|
xvmulsp vs45, vs15, alpha_r
|
|
#else
|
|
xvmaddasp vs44, vs11, alpha_r
|
|
xvmaddasp vs45, vs15, alpha_r
|
|
#endif
|
|
|
|
stxv vs44, 0(T7)
|
|
stxv vs45, 16(T7)
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs46, vs19, alpha_r
|
|
xvmulsp vs47, vs27, alpha_r
|
|
#else
|
|
xvmaddasp vs46, vs19, alpha_r
|
|
xvmaddasp vs47, vs27, alpha_r
|
|
#endif
|
|
|
|
stxv vs46, 32(T7)
|
|
stxv vs47, 48(T7)
|
|
|
|
|
|
addi CO,CO,64
|
|
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/**********************************************************************************************
|
|
* Macros for N=8 and M=8
|
|
**********************************************************************************************/
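
/* The N=8, M=8 macros below follow the same pattern as the 8x16 ones, with
   half the A width: two A vectors per k-step (vs0/vs1, vs4/vs5) and sixteen
   accumulators (vs32,vs33, vs36,vs37, ... vs60,vs61). */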
|
|
|
|
.macro LOAD8x8_1
|
|
LOAD8x8 1
|
|
.endm
|
|
|
|
.macro LOAD8x8_0
|
|
LOAD8x8 0
|
|
.endm
|
|
|
|
.macro KERNEL8x8_L1_L4 Index,IsLast
|
|
KERNEL8x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x8_I1_L4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
.macro KERNEL8x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro KERNEL8x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro END8x8_NORMAL
|
|
END8x8 0, AO, BO, 32,32
|
|
.endm
|
|
|
|
.macro Zero8X8
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
|
|
xxlxor vs48, vs48, vs48
|
|
xxlxor vs49, vs49, vs49
|
|
|
|
xxlxor vs52, vs52, vs52
|
|
xxlxor vs53, vs53, vs53
|
|
|
|
xxlxor vs56, vs56, vs56
|
|
xxlxor vs57, vs57, vs57
|
|
|
|
xxlxor vs60, vs60, vs60
|
|
xxlxor vs61, vs61, vs61
|
|
|
|
.endm
|
|
|
|
.macro LOAD8x8 Zero
|
|
|
|
lxv vs24, 0(BO)
|
|
lxv vs28, 16(BO)
|
|
lxv vs0, 0(AO)
|
|
lxv vs1, 16(AO)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
|
|
.if \Zero==1
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
xxlxor vs48, vs48, vs48
|
|
xxlxor vs49, vs49, vs49
|
|
xxlxor vs52, vs52, vs52
|
|
xxlxor vs53, vs53, vs53
|
|
xxlxor vs56, vs56, vs56
|
|
xxlxor vs57, vs57, vs57
|
|
xxlxor vs60, vs60, vs60
|
|
xxlxor vs61, vs61, vs61
|
|
.endif
|
|
.endm
|
|
|
|
|
|
.macro END8x8 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
|
|
xvmulsp vs48, vs0,vs28
|
|
xvmulsp vs49, vs1,vs28
|
|
|
|
xvmulsp vs52, vs0,vs29
|
|
xvmulsp vs53, vs1,vs29
|
|
|
|
xvmulsp vs56, vs0,vs30
|
|
xvmulsp vs57, vs1,vs30
|
|
|
|
xvmulsp vs60, vs0,vs31
|
|
xvmulsp vs61, vs1,vs31
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro KERNEL8x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP32(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs12, DISP32(\Index,16+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxperm vs14, vs12, permute_mask
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xxpermdi vs9, vs8, vs8,2
|
|
xxpermdi vs13, vs12, vs12,2
|
|
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
xxpermdi vs15, vs14, vs14,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
lxv vs24, DISP32(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs28, DISP32(\Index,32+16+\OffsetB)(\BREG)
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
|
|
lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
|
|
|
|
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
xvmaddasp vs48, vs4,vs12
|
|
xvmaddasp vs49, vs5,vs12
|
|
|
|
xvmaddasp vs52, vs4,vs13
|
|
xvmaddasp vs53, vs5,vs13
|
|
lxv vs8, DISP32(\Index,64+\OffsetB)(\BREG)
|
|
lxv vs12, DISP32(\Index,64+16+\OffsetB)(\BREG)
|
|
xvmaddasp vs56, vs4,vs14
|
|
xvmaddasp vs57, vs5,vs14
|
|
|
|
xvmaddasp vs60, vs4,vs15
|
|
xvmaddasp vs61, vs5,vs15
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxperm vs14, vs12, permute_mask
|
|
|
|
|
|
lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
|
|
|
|
|
xxpermdi vs9, vs8, vs8,2
|
|
xxpermdi vs13, vs12, vs12,2
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
xxpermdi vs15, vs14, vs14,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
.if \Complete==0
|
|
lxv vs24, DISP32(\Index,96+\OffsetB)(\BREG)
|
|
lxv vs28, DISP32(\Index,96+16+\OffsetB)(\BREG)
|
|
.endif
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
.if \Complete==0
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
.endif
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
|
|
|
|
.if \Complete==0
|
|
lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
|
|
addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
|
|
addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
|
|
.else
|
|
|
|
addi \BREG, \BREG, DISP32(\Index,128)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
.endif
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
|
|
.endif
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
xvmaddasp vs48, vs4,vs12
|
|
xvmaddasp vs49, vs5,vs12
|
|
|
|
xvmaddasp vs52, vs4,vs13
|
|
xvmaddasp vs53, vs5,vs13
|
|
|
|
xvmaddasp vs56, vs4,vs14
|
|
xvmaddasp vs57, vs5,vs14
|
|
|
|
xvmaddasp vs60, vs4,vs15
|
|
xvmaddasp vs61, vs5,vs15
|
|
|
|
.endm
|
|
|
|
.macro KERNEL8x8 First
|
|
|
|
LOAD8x8 0
|
|
END8x8 \First, AO, BO, 32,32
|
|
.endm
|
|
|
|
.macro KERNEL8x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs12, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxperm vs14, vs12, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
xxpermdi vs13, vs12, vs12,2
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
.endif
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
xxpermdi vs15, vs14, vs14,2
|
|
|
|
.if \First==1
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
|
|
xvmulsp vs48, vs0,vs28
|
|
xvmulsp vs49, vs1,vs28
|
|
|
|
xvmulsp vs52, vs0,vs29
|
|
xvmulsp vs53, vs1,vs29
|
|
|
|
xvmulsp vs56, vs0,vs30
|
|
xvmulsp vs57, vs1,vs30
|
|
|
|
xvmulsp vs60, vs0,vs31
|
|
xvmulsp vs61, vs1,vs31
|
|
|
|
.else
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
xvmaddasp vs48, vs0,vs28
|
|
xvmaddasp vs49, vs1,vs28
|
|
|
|
xvmaddasp vs52, vs0,vs29
|
|
xvmaddasp vs53, vs1,vs29
|
|
|
|
xvmaddasp vs56, vs0,vs30
|
|
xvmaddasp vs57, vs1,vs30
|
|
|
|
xvmaddasp vs60, vs0,vs31
|
|
xvmaddasp vs61, vs1,vs31
|
|
|
|
.endif
|
|
.if \Complete==0
|
|
lxv vs24, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs28, DISP16(\Index,32+16+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxperm vs30, vs28, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs29, vs28, vs28,2
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
|
|
addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
|
|
|
|
.else
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
.endif
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs4,vs8
|
|
xvmulsp vs33, vs5,vs8
|
|
|
|
xvmulsp vs36, vs4,vs9
|
|
xvmulsp vs37, vs5,vs9
|
|
|
|
.else
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
xxpermdi vs31, vs30, vs30,2
|
|
|
|
.endif
|
|
.if \First==1
|
|
xvmulsp vs40, vs4,vs10
|
|
xvmulsp vs41, vs5,vs10
|
|
|
|
xvmulsp vs44, vs4,vs11
|
|
xvmulsp vs45, vs5,vs11
|
|
|
|
xvmulsp vs48, vs4,vs12
|
|
xvmulsp vs49, vs5,vs12
|
|
|
|
xvmulsp vs52, vs4,vs13
|
|
xvmulsp vs53, vs5,vs13
|
|
|
|
xvmulsp vs56, vs4,vs14
|
|
xvmulsp vs57, vs5,vs14
|
|
|
|
xvmulsp vs60, vs4,vs15
|
|
xvmulsp vs61, vs5,vs15
|
|
|
|
.else
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
xvmaddasp vs48, vs4,vs12
|
|
xvmaddasp vs49, vs5,vs12
|
|
|
|
xvmaddasp vs52, vs4,vs13
|
|
xvmaddasp vs53, vs5,vs13
|
|
|
|
xvmaddasp vs56, vs4,vs14
|
|
xvmaddasp vs57, vs5,vs14
|
|
|
|
xvmaddasp vs60, vs4,vs15
|
|
xvmaddasp vs61, vs5,vs15
|
|
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE8x8
|
|
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
|
|
add T4, T2, T10
|
|
add T5, T3, T10
|
|
|
|
add T6, T4, T10
|
|
add T7, T5, T10
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs34, 0(CO)
|
|
lxv vs35, 16(CO)
|
|
lxv vs38, 0(T1)
|
|
lxv vs39, 16(T1)
|
|
lxv vs42, 0(T2)
|
|
lxv vs43, 16(T2)
|
|
lxv vs46, 0(T3)
|
|
lxv vs47, 16(T3)
|
|
|
|
lxv vs50, 0(T4)
|
|
lxv vs51, 16(T4)
|
|
lxv vs54, 0(T5)
|
|
lxv vs55, 16(T5)
|
|
lxv vs58, 0(T6)
|
|
lxv vs59, 16(T6)
|
|
lxv vs62, 0(T7)
|
|
lxv vs63, 16(T7)
|
|
#endif
|
|
|
|
xxmrglw vs8, vs32, vs44
|
|
xxmrglw vs10, vs36, vs40
|
|
|
|
xxmrghw vs1, vs32, vs44
|
|
xxmrghw vs0, vs36, vs40
|
|
|
|
xxmrglw vs12, vs33, vs45
|
|
xxmrglw vs14, vs37, vs41
|
|
|
|
xxmrghw vs2, vs37, vs41
|
|
xxmrghw vs3, vs33, vs45
|
|
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
|
|
|
|
/* scale by alpha and, for the non-TRMM case, multiply-add onto the C values loaded above */
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs34, vs8, alpha_r
|
|
xvmulsp vs35, vs12, alpha_r
|
|
xvmulsp vs38, vs9, alpha_r
|
|
xvmulsp vs39, vs13, alpha_r
|
|
xvmulsp vs42, vs10, alpha_r
|
|
xvmulsp vs43, vs14, alpha_r
|
|
xvmulsp vs46, vs11, alpha_r
|
|
xvmulsp vs47, vs15, alpha_r
|
|
#else
|
|
xvmaddasp vs34, vs8, alpha_r
|
|
xvmaddasp vs35, vs12, alpha_r
|
|
xvmaddasp vs38, vs9, alpha_r
|
|
xvmaddasp vs39, vs13, alpha_r
|
|
xvmaddasp vs42, vs10, alpha_r
|
|
xvmaddasp vs43, vs14, alpha_r
|
|
xvmaddasp vs46, vs11, alpha_r
|
|
xvmaddasp vs47, vs15, alpha_r
|
|
#endif
|
|
|
|
|
|
xxmrglw vs8, vs48, vs60
|
|
xxmrglw vs10, vs52, vs56
|
|
|
|
xxmrghw vs1, vs48, vs60
|
|
xxmrghw vs0, vs52, vs56
|
|
stxv vs34, 0(CO)
|
|
stxv vs35, 16(CO)
|
|
xxmrglw vs12, vs49, vs61
|
|
xxmrglw vs14, vs53, vs57
|
|
stxv vs38, 0(T1)
|
|
stxv vs39, 16(T1)
|
|
xxmrghw vs2, vs53, vs57
|
|
xxmrghw vs3, vs49, vs61
|
|
stxv vs42, 0(T2)
|
|
stxv vs43, 16(T2)
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
stxv vs46, 0(T3)
|
|
stxv vs47, 16(T3)
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
|
|
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs50, vs8, alpha_r
|
|
xvmulsp vs51, vs12, alpha_r
|
|
xvmulsp vs54, vs9, alpha_r
|
|
xvmulsp vs55, vs13, alpha_r
|
|
xvmulsp vs58, vs10, alpha_r
|
|
xvmulsp vs59, vs14, alpha_r
|
|
xvmulsp vs62, vs11, alpha_r
|
|
xvmulsp vs63, vs15, alpha_r
|
|
#else
|
|
xvmaddasp vs50, vs8, alpha_r
|
|
xvmaddasp vs51, vs12, alpha_r
|
|
xvmaddasp vs54, vs9, alpha_r
|
|
xvmaddasp vs55, vs13, alpha_r
|
|
xvmaddasp vs58, vs10, alpha_r
|
|
xvmaddasp vs59, vs14, alpha_r
|
|
xvmaddasp vs62, vs11, alpha_r
|
|
xvmaddasp vs63, vs15, alpha_r
|
|
#endif
|
|
|
|
stxv vs50, 0(T4)
|
|
stxv vs51, 16(T4)
|
|
stxv vs54, 0(T5)
|
|
stxv vs55, 16(T5)
|
|
stxv vs58, 0(T6)
|
|
stxv vs59, 16(T6)
|
|
stxv vs62, 0(T7)
|
|
stxv vs63, 16(T7)
|
|
|
|
addi CO,CO,32
|
|
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
|
|
* Macros for N=8 and M=4
|
|
**********************************************************************************************/
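
/* The N=8, M=4 macros below use one A vector per k-step: B is loaded as two
   full vectors (vs24/vs25) and A is the operand that gets permuted
   (vs0..vs3), so the accumulators are vs32..vs35 and vs48..vs51; SAVE8x4
   transposes them with xxmrglw/xxmrghw and xxmrgld/xxmrghd before applying
   alpha_r. */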
|
|
|
|
.macro LOAD8x4_1
|
|
LOAD8x4 1
|
|
.endm
|
|
|
|
.macro LOAD8x4_0
|
|
LOAD8x4 0
|
|
.endm
|
|
|
|
.macro KERNEL8x4_L1_L4 Index,IsLast
|
|
KERNEL8x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x4_I1_L4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
.macro KERNEL8x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro KERNEL8x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL8x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro Zero8X4
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
|
|
xxlxor vs48, vs48, vs48
|
|
xxlxor vs49, vs49, vs49
|
|
xxlxor vs50, vs50, vs50
|
|
xxlxor vs51, vs51, vs51
|
|
|
|
.endm
|
|
|
|
.macro LOAD8x4 Zero
|
|
|
|
lxv vs0, 0(AO)
|
|
lxv vs24, 0(BO)
|
|
lxv vs25, 16(BO)
|
|
|
|
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
|
|
.if \Zero==1
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
|
|
xxlxor vs48, vs48, vs48
|
|
xxlxor vs49, vs49, vs49
|
|
xxlxor vs50, vs50, vs50
|
|
xxlxor vs51, vs51, vs51
|
|
.endif
|
|
.endm
|
|
|
|
.macro END8x4_NORMAL
|
|
END8x4 0, AO, BO, 16,32
|
|
.endm
|
|
|
|
.macro END8x4 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs24, vs0
|
|
xvmulsp vs33, vs24, vs1
|
|
xvmulsp vs34, vs24, vs2
|
|
xvmulsp vs35, vs24, vs3
|
|
|
|
xvmulsp vs48, vs25, vs0
|
|
xvmulsp vs49, vs25, vs1
|
|
xvmulsp vs50, vs25, vs2
|
|
xvmulsp vs51, vs25, vs3
|
|
.else
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
xvmaddasp vs48, vs25, vs0
|
|
xvmaddasp vs49, vs25, vs1
|
|
xvmaddasp vs50, vs25, vs2
|
|
xvmaddasp vs51, vs25, vs3
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro KERNEL8x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
xvmaddasp vs48, vs25, vs0
|
|
xvmaddasp vs49, vs25, vs1
|
|
xvmaddasp vs50, vs25, vs2
|
|
xvmaddasp vs51, vs25, vs3
|
|
|
|
lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
|
|
lxv vs24, DISP32(\Index, 32+\OffsetB)(\BREG)
|
|
lxv vs25, DISP32(\Index, 48+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
xvmaddasp vs48, vs27, vs4
|
|
xvmaddasp vs49, vs27, vs5
|
|
xvmaddasp vs50, vs27, vs6
|
|
xvmaddasp vs51, vs27, vs7
|
|
|
|
|
|
lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
|
|
lxv vs26, DISP32(\Index, 64+\OffsetB)(\BREG)
|
|
lxv vs27, DISP32(\Index, 80+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
xvmaddasp vs48, vs25, vs0
|
|
xvmaddasp vs49, vs25, vs1
|
|
xvmaddasp vs50, vs25, vs2
|
|
xvmaddasp vs51, vs25, vs3
|
|
|
|
.if \Complete==0
|
|
|
|
lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
|
|
lxv vs24, DISP32(\Index, 96+\OffsetB)(\BREG)
|
|
lxv vs25, DISP32(\Index, 96+16+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
.endif
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
xvmaddasp vs48, vs27, vs4
|
|
xvmaddasp vs49, vs27, vs5
|
|
xvmaddasp vs50, vs27, vs6
|
|
xvmaddasp vs51, vs27, vs7
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
|
|
addi \BREG, \BREG, DISP32(\Index,32*3+\OffsetB)
|
|
|
|
.else
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
addi \BREG, \BREG, DISP32(\Index,128)
|
|
|
|
.endif
|
|
.endif
|
|
|
|
|
|
.endm
|
|
|
|
.macro KERNEL8x4 First
|
|
LOAD8x4 0
|
|
END8x4 \First, AO, BO, 16,32
|
|
.endm
|
|
|
|
.macro KERNEL8x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
.if \First==1
|
|
xvmulsp vs32, vs24, vs0
|
|
xvmulsp vs33, vs24, vs1
|
|
xvmulsp vs34, vs24, vs2
|
|
xvmulsp vs35, vs24, vs3
|
|
|
|
xvmulsp vs48, vs25, vs0
|
|
xvmulsp vs49, vs25, vs1
|
|
xvmulsp vs50, vs25, vs2
|
|
xvmulsp vs51, vs25, vs3
|
|
.else
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
xvmaddasp vs48, vs25, vs0
|
|
xvmaddasp vs49, vs25, vs1
|
|
xvmaddasp vs50, vs25, vs2
|
|
xvmaddasp vs51, vs25, vs3
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
|
|
lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
|
|
lxv vs24, DISP16(\Index, 32+\OffsetB)(\BREG)
|
|
lxv vs25, DISP16(\Index, 48+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs26, vs4
|
|
xvmulsp vs33, vs26, vs5
|
|
xvmulsp vs34, vs26, vs6
|
|
xvmulsp vs35, vs26, vs7
|
|
|
|
xvmulsp vs48, vs27, vs4
|
|
xvmulsp vs49, vs27, vs5
|
|
xvmulsp vs50, vs27, vs6
|
|
xvmulsp vs51, vs27, vs7
|
|
|
|
|
|
.else
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
xvmaddasp vs48, vs27, vs4
|
|
xvmaddasp vs49, vs27, vs5
|
|
xvmaddasp vs50, vs27, vs6
|
|
xvmaddasp vs51, vs27, vs7
|
|
.endif
|
|
|
|
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
|
|
addi \BREG, \BREG, DISP16(\Index,32+\OffsetB)
|
|
|
|
.else
|
|
addi \AREG, \AREG, DISP8(\Index,32)
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
|
|
.endif
|
|
.endif
|
|
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE8x4
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs36, 0(CO)
|
|
lxv vs37, 0(T1)
|
|
#endif
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs38, 0(T2)
|
|
lxv vs39, 0(T3)
|
|
#endif
|
|
add T4, T2, T10
|
|
add T5, T3, T10
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs40, 0(T4)
|
|
lxv vs41, 0(T5)
|
|
#endif
|
|
add T6, T4, T10
|
|
add T7, T5, T10
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs42, 0(T6)
|
|
lxv vs43, 0(T7)
|
|
#endif
|
|
xxmrglw vs0, vs35,vs32
|
|
xxmrglw vs1, vs34,vs33
|
|
xxmrglw vs4, vs32,vs35
|
|
xxmrglw vs5, vs33,vs34
|
|
|
|
|
|
xxmrghw vs2, vs35,vs32
|
|
xxmrghw vs3, vs34,vs33
|
|
xxmrghw vs6, vs32,vs35
|
|
xxmrghw vs7, vs33,vs34
|
|
|
|
xxmrgld vs24, vs1, vs0
|
|
xxmrghd vs25,vs5,vs4
|
|
|
|
xxmrgld vs26, vs2, vs3
|
|
xxmrghd vs27,vs6,vs7
|
|
|
|
|
|
xxmrglw vs0, vs51,vs48
|
|
xxmrglw vs1, vs50,vs49
|
|
xxmrglw vs4, vs48,vs51
|
|
xxmrglw vs5, vs49,vs50
|
|
|
|
xxmrghw vs2, vs51,vs48
|
|
xxmrghw vs3, vs50,vs49
|
|
xxmrghw vs6, vs48,vs51
|
|
xxmrghw vs7, vs49,vs50
|
|
|
|
xxmrgld vs28, vs1, vs0
|
|
xxmrghd vs29,vs5,vs4
|
|
|
|
xxmrgld vs30, vs2, vs3
|
|
xxmrghd vs31,vs6,vs7
|
|
#if defined(TRMMKERNEL)
|
|
|
|
xvmulsp vs36, vs24, alpha_r
|
|
xvmulsp vs37, vs25, alpha_r
|
|
xvmulsp vs38, vs26, alpha_r
|
|
xvmulsp vs39, vs27, alpha_r
|
|
xvmulsp vs40, vs28, alpha_r
|
|
xvmulsp vs41, vs29, alpha_r
|
|
xvmulsp vs42, vs30, alpha_r
|
|
xvmulsp vs43, vs31, alpha_r
|
|
#else
|
|
xvmaddasp vs36, vs24, alpha_r
|
|
xvmaddasp vs37, vs25, alpha_r
|
|
xvmaddasp vs38, vs26, alpha_r
|
|
xvmaddasp vs39, vs27, alpha_r
|
|
xvmaddasp vs40, vs28, alpha_r
|
|
xvmaddasp vs41, vs29, alpha_r
|
|
xvmaddasp vs42, vs30, alpha_r
|
|
xvmaddasp vs43, vs31, alpha_r
|
|
#endif
|
|
|
|
stxv vs36, 0(CO)
|
|
stxv vs37, 0(T1)
|
|
stxv vs38, 0(T2)
|
|
stxv vs39, 0(T3)
|
|
stxv vs40, 0(T4)
|
|
stxv vs41, 0(T5)
|
|
stxv vs42, 0(T6)
|
|
stxv vs43, 0(T7)
|
|
|
|
|
|
addi CO,CO,16
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
|
|
* Macros for N=8 and M=2
|
|
**********************************************************************************************/
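
/* The N=8, M=2 macros below splat the two A values of each k-step
   (vs8/vs9) against two B vectors (vs26/vs27), accumulating into vs0..vs3;
   SAVE8x2 extracts the lanes, converts them to double with xscvspdp,
   applies alpha with scalar xsmuldp/xsmaddadp, and stores single-precision
   elements with stxssp. */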
|
|
|
|
|
|
.macro KERNEL8x2_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
|
|
|
|
.macro Zero8x2
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
xxlxor vs2, vs2, vs2
|
|
xxlxor vs3, vs3, vs3
|
|
|
|
.endm
|
|
|
|
.macro KERNEL8x2
|
|
KERNEL8x2_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL8x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs27, DISP8(\Index,16+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 0
|
|
xxspltw vs9, vs36, 1
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
xvmulsp vs2, vs26, vs9
|
|
xvmulsp vs3, vs27, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs26, vs9
|
|
xvmaddasp vs3, vs27, vs9
|
|
|
|
.endif
|
|
|
|
addi \AREG, \AREG, DISP2(\Index,8)
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
|
|
.endm
|
|
|
|
.macro KERNEL8x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs27, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
lxv vs28, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs29, DISP16(\Index,48+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs4, 2
|
|
xxspltw vs9, vs4, 3
|
|
xxspltw vs10, vs4, 0
|
|
xxspltw vs11, vs4, 1
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
xvmulsp vs2, vs26, vs9
|
|
xvmulsp vs3, vs27, vs9
|
|
|
|
/* second k-step must accumulate onto the products of the first one */
xvmaddasp vs0, vs28, vs10
xvmaddasp vs1, vs29, vs10
xvmaddasp vs2, vs28, vs11
xvmaddasp vs3, vs29, vs11
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs26, vs9
|
|
xvmaddasp vs3, vs27, vs9
|
|
|
|
xvmaddasp vs0, vs28, vs10
|
|
xvmaddasp vs1, vs29, vs10
|
|
xvmaddasp vs2, vs28, vs11
|
|
xvmaddasp vs3, vs29, vs11
|
|
.endif
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE8x2
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
add T4, T2, T10
|
|
add T5, T3, T10
|
|
add T6, T4, T10
|
|
add T7, T5, T10
|
|
/* convert alpha_r (single precision) to double for the scalar multiplies below */
xscvspdp vs4,alpha_r
/* the v0..v15 registers used below alias vs32..vs47 (v0 corresponds to vs32) */
|
|
#if !defined(TRMMKERNEL)
|
|
lxssp v0,0(CO)
|
|
lxssp v1,4(CO)
|
|
|
|
lxssp v2,0(T1)
|
|
lxssp v3,4(T1)
|
|
|
|
lxssp v4,0(T2)
|
|
lxssp v5,4(T2)
|
|
|
|
lxssp v6,0(T3)
|
|
lxssp v7,4(T3)
|
|
|
|
lxssp v8,0(T4)
|
|
lxssp v9,4(T4)
|
|
|
|
lxssp v10,0(T5)
|
|
lxssp v11,4(T5)
|
|
|
|
lxssp v12,0(T6)
|
|
lxssp v13,4(T6)
|
|
|
|
lxssp v14,0(T7)
|
|
lxssp v15,4(T7)
|
|
#endif
|
|
xscvspdp vs5, vs2
|
|
xxspltw vs6, vs2, 1
|
|
xxspltw vs7, vs2, 2
|
|
xxspltw vs8, vs2, 3
|
|
xscvspdp vs6,vs6
|
|
xscvspdp vs7,vs7
|
|
xscvspdp vs8,vs8
|
|
|
|
xscvspdp vs24, vs0
|
|
xxspltw vs25, vs0, 1
|
|
xxspltw vs26, vs0, 2
|
|
xxspltw vs27, vs0, 3
|
|
xscvspdp vs25,vs25
|
|
xscvspdp vs26,vs26
|
|
xscvspdp vs27,vs27
|
|
|
|
xscvspdp vs9, vs3
|
|
xxspltw vs10, vs3, 1
|
|
xxspltw vs11, vs3, 2
|
|
xxspltw vs12, vs3, 3
|
|
xscvspdp vs10,vs10
|
|
xscvspdp vs11,vs11
|
|
xscvspdp vs12,vs12
|
|
|
|
xscvspdp vs28, vs1
|
|
xxspltw vs29, vs1, 1
|
|
xxspltw vs30, vs1, 2
|
|
xxspltw vs31, vs1, 3
|
|
xscvspdp vs29,vs29
|
|
xscvspdp vs30,vs30
|
|
xscvspdp vs31,vs31
|
|
|
|
|
|
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xsmuldp vs32,vs8, vs4
|
|
xsmuldp vs33,vs27, vs4
|
|
|
|
xsmuldp vs34,vs7, vs4
|
|
xsmuldp vs35,vs26, vs4
|
|
|
|
xsmuldp vs36,vs6, vs4
|
|
xsmuldp vs37,vs25, vs4
|
|
|
|
xsmuldp vs38,vs5, vs4
|
|
xsmuldp vs39,vs24, vs4
|
|
|
|
xsmuldp vs40,vs12, vs4
|
|
xsmuldp vs41,vs31, vs4
|
|
|
|
xsmuldp vs42,vs11, vs4
|
|
xsmuldp vs43,vs30, vs4
|
|
|
|
xsmuldp vs44,vs10, vs4
|
|
xsmuldp vs45,vs29, vs4
|
|
|
|
xsmuldp vs46,vs9, vs4
|
|
xsmuldp vs47,vs28, vs4
|
|
#else
|
|
xsmaddadp vs32,vs8, vs4
|
|
xsmaddadp vs33,vs27, vs4
|
|
|
|
xsmaddadp vs34,vs7, vs4
|
|
xsmaddadp vs35,vs26, vs4
|
|
|
|
xsmaddadp vs36,vs6, vs4
|
|
xsmaddadp vs37,vs25, vs4
|
|
|
|
xsmaddadp vs38,vs5, vs4
|
|
xsmaddadp vs39,vs24, vs4
|
|
|
|
xsmaddadp vs40,vs12, vs4
|
|
xsmaddadp vs41,vs31, vs4
|
|
|
|
xsmaddadp vs42,vs11, vs4
|
|
xsmaddadp vs43,vs30, vs4
|
|
|
|
xsmaddadp vs44,vs10, vs4
|
|
xsmaddadp vs45,vs29, vs4
|
|
|
|
xsmaddadp vs46,vs9, vs4
|
|
xsmaddadp vs47,vs28, vs4
|
|
#endif
|
|
|
|
stxssp v0,0(CO)
|
|
stxssp v1,4(CO)
|
|
|
|
stxssp v2,0(T1)
|
|
stxssp v3,4(T1)
|
|
|
|
stxssp v4,0(T2)
|
|
stxssp v5,4(T2)
|
|
|
|
stxssp v6,0(T3)
|
|
stxssp v7,4(T3)
|
|
|
|
stxssp v8,0(T4)
|
|
stxssp v9,4(T4)
|
|
|
|
stxssp v10,0(T5)
|
|
stxssp v11,4(T5)
|
|
|
|
stxssp v12,0(T6)
|
|
stxssp v13,4(T6)
|
|
|
|
stxssp v14,0(T7)
|
|
stxssp v15,4(T7)
|
|
|
|
|
|
addi CO,CO,8
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
|
|
* Macros for N=8 and M=1
|
|
**********************************************************************************************/
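
/* The N=8, M=1 macros below splat a single A value per k-step against the
   eight B values in vs26/vs27, accumulating into vs0/vs1; SAVE8x1 uses the
   same scalar lane-extraction and double-precision multiply-add path as
   SAVE8x2. */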
|
|
.macro KERNEL8x1_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL8x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro Zero8x1
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
.endm
|
|
|
|
.macro KERNEL8x1
|
|
KERNEL8x1_1 AO,BO, 0
|
|
.endm
|
|
|
|
.macro KERNEL8x1_2
|
|
KERNEL8x1_2_1 AO,BO, 0
|
|
.endm
|
|
|
|
.macro KERNEL8x1_1 AREG,BREG,First
|
|
lxvwsx vs8, 0, \AREG
|
|
lxv vs26, 0(\BREG)
|
|
lxv vs27, 16(\BREG)
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
.endif
|
|
addi \AREG, \AREG, 4
|
|
addi \BREG, \BREG, 32
|
|
.endm
|
|
|
|
.macro KERNEL8x1_2_1 AREG,BREG,First
|
|
lxsd v4, 0(\AREG)
|
|
lxv vs26, 0(\BREG)
|
|
lxv vs27, 16(\BREG)
|
|
lxv vs28, 32(\BREG)
|
|
lxv vs29, 48(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
/* second k-step accumulates onto the first */
xvmaddasp vs0, vs28, vs9
xvmaddasp vs1, vs29, vs9
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs0, vs28, vs9
|
|
xvmaddasp vs1, vs29, vs9
|
|
.endif
|
|
addi \AREG, \AREG, 8
|
|
addi \BREG, \BREG, 64
|
|
.endm
|
|
|
|
.macro KERNEL8x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
|
|
lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
xxspltw vs8, vs4, 3
|
|
xxspltw vs9, vs4, 2
|
|
xxspltw vs10, vs4, 1
|
|
xxspltw vs11, vs4, 0
|
|
lxv vs26, DISP32(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs27, DISP32(\Index,16+\OffsetB)(\BREG)
|
|
lxv vs28, DISP32(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs29, DISP32(\Index,48+\OffsetB)(\BREG)
|
|
lxv vs30, DISP32(\Index,64+ 0+\OffsetB)(\BREG)
|
|
lxv vs31, DISP32(\Index,64+16+\OffsetB)(\BREG)
|
|
lxv vs32, DISP32(\Index,64+32+\OffsetB)(\BREG)
|
|
lxv vs33, DISP32(\Index,64+48+\OffsetB)(\BREG)
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
/* the remaining k-steps accumulate onto the first */
xvmaddasp vs0, vs28, vs9
xvmaddasp vs1, vs29, vs9
xvmaddasp vs0, vs30, vs10
xvmaddasp vs1, vs31, vs10
xvmaddasp vs0, vs32, vs11
xvmaddasp vs1, vs33, vs11
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs0, vs28, vs9
|
|
xvmaddasp vs1, vs29, vs9
|
|
xvmaddasp vs0, vs30, vs10
|
|
xvmaddasp vs1, vs31, vs10
|
|
xvmaddasp vs0, vs32, vs11
|
|
xvmaddasp vs1, vs33, vs11
|
|
.endif
|
|
.if \IsLast==1
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
addi \BREG, \BREG, DISP32(\Index,128)
|
|
.endif
|
|
.endm
|
|
|
|
.macro SAVE8x1
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
add T4, T2, T10
|
|
add T5, T3, T10
|
|
add T6, T4, T10
|
|
add T7, T5, T10
|
|
/* convert alpha_r (single precision) to double for the scalar multiplies below */
xscvspdp vs4,alpha_r
/* the v0..v14 registers used below alias vs32..vs46 (v0 corresponds to vs32) */
|
|
#if !defined(TRMMKERNEL)
|
|
lxssp v0,0(CO)
|
|
lxssp v2,0(T1)
|
|
lxssp v4,0(T2)
|
|
lxssp v6,0(T3)
|
|
lxssp v8,0(T4)
|
|
lxssp v10,0(T5)
|
|
lxssp v12,0(T6)
|
|
lxssp v14,0(T7)
|
|
#endif
|
|
xscvspdp vs24, vs0
|
|
xxspltw vs25, vs0, 1
|
|
xxspltw vs26, vs0, 2
|
|
xxspltw vs27, vs0, 3
|
|
xscvspdp vs25,vs25
|
|
xscvspdp vs26,vs26
|
|
xscvspdp vs27,vs27
|
|
xscvspdp vs28, vs1
|
|
xxspltw vs29, vs1, 1
|
|
xxspltw vs30, vs1, 2
|
|
xxspltw vs31, vs1, 3
|
|
xscvspdp vs29,vs29
|
|
xscvspdp vs30,vs30
|
|
xscvspdp vs31,vs31
|
|
#if defined(TRMMKERNEL)
|
|
xsmuldp vs32,vs27, vs4
|
|
xsmuldp vs34,vs26, vs4
|
|
xsmuldp vs36,vs25, vs4
|
|
xsmuldp vs38,vs24, vs4
|
|
xsmuldp vs40,vs31, vs4
|
|
xsmuldp vs42,vs30, vs4
|
|
xsmuldp vs44,vs29, vs4
|
|
xsmuldp vs46,vs28, vs4
|
|
#else
|
|
xsmaddadp vs32,vs27, vs4
|
|
xsmaddadp vs34,vs26, vs4
|
|
xsmaddadp vs36,vs25, vs4
|
|
xsmaddadp vs38,vs24, vs4
|
|
xsmaddadp vs40,vs31, vs4
|
|
xsmaddadp vs42,vs30, vs4
|
|
xsmaddadp vs44,vs29, vs4
|
|
xsmaddadp vs46,vs28, vs4
|
|
#endif
|
|
stxssp v0,0(CO)
|
|
stxssp v2,0(T1)
|
|
stxssp v4,0(T2)
|
|
stxssp v6,0(T3)
|
|
stxssp v8,0(T4)
|
|
stxssp v10,0(T5)
|
|
stxssp v12,0(T6)
|
|
stxssp v14,0(T7)
|
|
addi CO,CO,4
|
|
.endm
|
|
|
|
|
|
|
|
/**********************************************************************************************
|
|
* Macros for N=4 and M=16
|
|
**********************************************************************************************/
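
/* The N=4, M=16 macros below mirror the 8x16 ones with a single B vector
   (four columns) per k-step, so only vs32..vs47 are used as accumulators. */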
|
|
|
|
.macro LOAD4x16_1
|
|
LOAD4x16 1
|
|
.endm
|
|
|
|
.macro LOAD4x16_0
|
|
LOAD4x16 0
|
|
.endm
|
|
|
|
.macro KERNEL4x16_L1_L4 Index,IsLast
|
|
KERNEL4x16_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x16_I1_L4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x16_I1_L4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x16_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
.macro KERNEL4x16_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro KERNEL4x16_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x16_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x16_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro Zero4X16
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
xxlxor vs38, vs38, vs38
|
|
xxlxor vs39, vs39, vs39
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
xxlxor vs42, vs42, vs42
|
|
xxlxor vs43, vs43, vs43
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
xxlxor vs46, vs46, vs46
|
|
xxlxor vs47, vs47, vs47
|
|
.endm
|
|
|
|
.macro LOAD4x16 Zero
|
|
|
|
lxv vs24, 0(BO)
|
|
lxv vs0, 0(AO)
|
|
lxv vs1, 16(AO)
|
|
lxv vs2, 32(AO)
|
|
lxv vs3, 48(AO)
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.if \Zero==1
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
xxlxor vs38, vs38, vs38
|
|
xxlxor vs39, vs39, vs39
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
xxlxor vs42, vs42, vs42
|
|
xxlxor vs43, vs43, vs43
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
xxlxor vs46, vs46, vs46
|
|
xxlxor vs47, vs47, vs47
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro END4x16_NORMAL
|
|
END4x16 0, AO, BO, 64,16
|
|
.endm
|
|
|
|
.macro END4x16 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
xvmulsp vs34, vs2,vs24
|
|
xvmulsp vs35, vs3,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
xvmulsp vs38, vs2,vs25
|
|
xvmulsp vs39, vs3,vs25
|
|
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
xvmulsp vs42, vs2,vs26
|
|
xvmulsp vs43, vs3,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
xvmulsp vs46, vs2,vs27
|
|
xvmulsp vs47, vs3,vs27
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
|
|
.endif
|
|
.endm
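/* END4x16 consumes the values left in vs0..vs3/vs24..vs27 by the last load.
   With First==1 it uses xvmulsp, overwriting the accumulators, so the very
   first k-iteration doubles as initialization; otherwise it accumulates with
   xvmaddasp.  OffsetA/OffsetB advance AO/BO past the data that LOAD4x16 read
   without moving the pointers. */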
|
|
|
|
.macro KERNEL4x16_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP64(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP64(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs6, DISP64(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs7, DISP64(\Index,48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
|
|
|
|
|
|
lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP64(\Index,64+\OffsetA)(\AREG)
|
|
lxv vs1, DISP64(\Index,64+16+\OffsetA)(\AREG)
|
|
lxv vs2, DISP64(\Index,64+32+\OffsetA)(\AREG)
|
|
lxv vs3, DISP64(\Index,64+48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
xvmaddasp vs34, vs6,vs8
|
|
xvmaddasp vs35, vs7,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
xvmaddasp vs38, vs6,vs9
|
|
xvmaddasp vs39, vs7,vs9
|
|
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
xvmaddasp vs42, vs6,vs10
|
|
xvmaddasp vs43, vs7,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
xvmaddasp vs46, vs6,vs11
|
|
xvmaddasp vs47, vs7,vs11
|
|
|
|
|
|
lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP64(\Index,128+0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP64(\Index,128+16+\OffsetA)(\AREG)
|
|
lxv vs6, DISP64(\Index,128+32+\OffsetA)(\AREG)
|
|
lxv vs7, DISP64(\Index,128+48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
|
|
|
|
|
|
.if \Complete==0
|
|
lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP64(\Index,192+\OffsetA)(\AREG)
|
|
lxv vs1, DISP64(\Index,192+16+\OffsetA)(\AREG)
|
|
lxv vs2, DISP64(\Index,192+32+\OffsetA)(\AREG)
|
|
lxv vs3, DISP64(\Index,192+48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
|
|
addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
|
|
addi \AREG, \AREG, DISP64(\Index,64*3+\OffsetA)
|
|
.else
|
|
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
addi \AREG, \AREG, DISP64(\Index,256)
|
|
.endif
|
|
.endif
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
xvmaddasp vs34, vs6,vs8
|
|
xvmaddasp vs35, vs7,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
xvmaddasp vs38, vs6,vs9
|
|
xvmaddasp vs39, vs7,vs9
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.endif
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
xvmaddasp vs42, vs6,vs10
|
|
xvmaddasp vs43, vs7,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
xvmaddasp vs46, vs6,vs11
|
|
xvmaddasp vs47, vs7,vs11
|
|
|
|
|
|
|
|
.endm
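/* KERNEL4x16_L1_L4_I processes four k-iterations per invocation, ping-ponging
   between the register sets {vs0..vs3, vs24..vs27} and {vs4..vs7, vs8..vs11}
   so the loads for one iteration overlap the FMAs of the previous one.
   Complete==1 suppresses the final preload (used on the last trip, where no
   further iteration follows), and IsLast==1 advances AO/BO by the bytes
   actually consumed.  DISP64/DISP16 turn the unroll Index into a byte
   displacement (Index*4*64 for A, Index*4*16 for B). */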
|
|
|
|
.macro KERNEL4x16 First
|
|
|
|
LOAD4x16 0
|
|
END4x16 \First, AO, BO, 64,16
|
|
.endm
|
|
|
|
.macro KERNEL4x16_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs6, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs7, DISP32(\Index,48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
xvmulsp vs34, vs2,vs24
|
|
xvmulsp vs35, vs3,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
xvmulsp vs38, vs2,vs25
|
|
xvmulsp vs39, vs3,vs25
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
xvmaddasp vs34, vs2,vs24
|
|
xvmaddasp vs35, vs3,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
xvmaddasp vs38, vs2,vs25
|
|
xvmaddasp vs39, vs3,vs25
|
|
.endif
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
.if \First==1
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
xvmulsp vs42, vs2,vs26
|
|
xvmulsp vs43, vs3,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
xvmulsp vs46, vs2,vs27
|
|
xvmulsp vs47, vs3,vs27
|
|
|
|
|
|
.else
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
xvmaddasp vs42, vs2,vs26
|
|
xvmaddasp vs43, vs3,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
xvmaddasp vs46, vs2,vs27
|
|
xvmaddasp vs47, vs3,vs27
|
|
|
|
|
|
.endif
|
|
.if \Complete==0
|
|
lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
|
|
lxv vs0, DISP32(\Index,64+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
|
lxv vs2, DISP32(\Index,64+32+\OffsetA)(\AREG)
|
|
lxv vs3, DISP32(\Index,64+48+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
|
|
addi \AREG, \AREG, DISP32(\Index,64+\OffsetA)
|
|
|
|
.else
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs4,vs8
|
|
xvmulsp vs33, vs5,vs8
|
|
xvmulsp vs34, vs6,vs8
|
|
xvmulsp vs35, vs7,vs8
|
|
|
|
xvmulsp vs36, vs4,vs9
|
|
xvmulsp vs37, vs5,vs9
|
|
xvmulsp vs38, vs6,vs9
|
|
xvmulsp vs39, vs7,vs9
|
|
.else
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
xvmaddasp vs34, vs6,vs8
|
|
xvmaddasp vs35, vs7,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
xvmaddasp vs38, vs6,vs9
|
|
xvmaddasp vs39, vs7,vs9
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.endif
|
|
.if \First==1
|
|
xvmulsp vs40, vs4,vs10
|
|
xvmulsp vs41, vs5,vs10
|
|
xvmulsp vs42, vs6,vs10
|
|
xvmulsp vs43, vs7,vs10
|
|
|
|
xvmulsp vs44, vs4,vs11
|
|
xvmulsp vs45, vs5,vs11
|
|
xvmulsp vs46, vs6,vs11
|
|
xvmulsp vs47, vs7,vs11
|
|
|
|
|
|
|
|
.else
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
xvmaddasp vs42, vs6,vs10
|
|
xvmaddasp vs43, vs7,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
xvmaddasp vs46, vs6,vs11
|
|
xvmaddasp vs47, vs7,vs11
|
|
|
|
|
|
|
|
.endif
|
|
|
|
.endm
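/* KERNEL4x16_L1_L2_I is the two-iteration variant of the same scheme; it
   backs KERNEL4x16_I1_L2_3 above and appears intended for the short k
   remainder, with First selecting xvmulsp initialization instead of
   accumulation. */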
|
|
|
|
|
|
.macro SAVE4x16
|
|
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
|
|
|
|
|
|
xxmrglw vs8, vs32, vs44
|
|
xxmrglw vs10, vs36, vs40
|
|
|
|
xxmrghw vs1, vs32, vs44
|
|
xxmrghw vs0, vs36, vs40
|
|
|
|
xxmrglw vs12, vs33, vs45
|
|
xxmrglw vs14, vs37, vs41
|
|
|
|
xxmrghw vs2, vs37, vs41
|
|
xxmrghw vs3, vs33, vs45
|
|
|
|
xxmrglw vs16, vs34, vs46
|
|
xxmrglw vs18, vs38, vs42
|
|
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
|
|
xxmrghw vs4, vs38, vs42
|
|
xxmrghw vs5, vs34, vs46
|
|
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxmrglw vs24, vs35, vs47
|
|
xxmrglw vs26, vs39, vs43
|
|
|
|
xxlor vs17, vs16, vs16
|
|
xxlor vs19, vs18, vs18
|
|
|
|
xxmrghw vs30, vs39, vs43
|
|
xxmrghw vs31, vs35, vs47
|
|
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs32, 0(CO)
|
|
lxv vs33, 16(CO)
|
|
lxv vs34, 32(CO)
|
|
lxv vs35, 48(CO)
|
|
#endif
|
|
xxlor vs25, vs24, vs24
|
|
xxlor vs27, vs26, vs26
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs36, 0(T1)
|
|
lxv vs37, 16(T1)
|
|
lxv vs38, 32(T1)
|
|
lxv vs39, 48(T1)
|
|
#endif
|
|
#ifndef TRMMKERNEL
|
|
lxv vs40, 0(T2)
|
|
lxv vs41, 16(T2)
|
|
lxv vs42, 32(T2)
|
|
lxv vs43, 48(T2)
|
|
#endif
|
|
#ifndef TRMMKERNEL
|
|
lxv vs44, 0(T3)
|
|
lxv vs45, 16(T3)
|
|
lxv vs46, 32(T3)
|
|
lxv vs47, 48(T3)
|
|
#endif
|
|
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
|
|
xxperm vs16, vs4, save_permute_1
|
|
xxperm vs18, vs5, save_permute_1
|
|
|
|
xxperm vs17, vs4, save_permute_2
|
|
xxperm vs19, vs5, save_permute_2
|
|
|
|
xxperm vs24, vs30, save_permute_1
|
|
xxperm vs26, vs31, save_permute_1
|
|
|
|
xxperm vs25, vs30, save_permute_2
|
|
xxperm vs27, vs31, save_permute_2
|
|
|
|
|
|
/* multiply add normal way */
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs32, vs8, alpha_r
|
|
xvmulsp vs33, vs12, alpha_r
|
|
xvmulsp vs34, vs16, alpha_r
|
|
xvmulsp vs35, vs24, alpha_r
|
|
xvmulsp vs36, vs9, alpha_r
|
|
xvmulsp vs37, vs13, alpha_r
|
|
xvmulsp vs38, vs17, alpha_r
|
|
xvmulsp vs39, vs25, alpha_r
|
|
#else
|
|
xvmaddasp vs32, vs8, alpha_r
|
|
xvmaddasp vs33, vs12, alpha_r
|
|
xvmaddasp vs34, vs16, alpha_r
|
|
xvmaddasp vs35, vs24, alpha_r
|
|
xvmaddasp vs36, vs9, alpha_r
|
|
xvmaddasp vs37, vs13, alpha_r
|
|
xvmaddasp vs38, vs17, alpha_r
|
|
xvmaddasp vs39, vs25, alpha_r
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs40, vs10, alpha_r
|
|
xvmulsp vs41, vs14, alpha_r
|
|
xvmulsp vs42, vs18, alpha_r
|
|
xvmulsp vs43, vs26, alpha_r
|
|
xvmulsp vs44, vs11, alpha_r
|
|
xvmulsp vs45, vs15, alpha_r
|
|
xvmulsp vs46, vs19, alpha_r
|
|
xvmulsp vs47, vs27, alpha_r
|
|
#else
|
|
|
|
xvmaddasp vs40, vs10, alpha_r
|
|
xvmaddasp vs41, vs14, alpha_r
|
|
xvmaddasp vs42, vs18, alpha_r
|
|
xvmaddasp vs43, vs26, alpha_r
|
|
xvmaddasp vs44, vs11, alpha_r
|
|
xvmaddasp vs45, vs15, alpha_r
|
|
xvmaddasp vs46, vs19, alpha_r
|
|
xvmaddasp vs47, vs27, alpha_r
|
|
|
|
#endif
|
|
|
|
stxv vs32, 0(CO)
|
|
stxv vs33, 16(CO)
|
|
stxv vs34, 32(CO)
|
|
stxv vs35, 48(CO)
|
|
|
|
stxv vs36, 0(T1)
|
|
stxv vs37, 16(T1)
|
|
stxv vs38, 32(T1)
|
|
stxv vs39, 48(T1)
|
|
|
|
stxv vs40, 0(T2)
|
|
stxv vs41, 16(T2)
|
|
stxv vs42, 32(T2)
|
|
stxv vs43, 48(T2)
|
|
stxv vs44, 0(T3)
|
|
stxv vs45, 16(T3)
|
|
stxv vs46, 32(T3)
|
|
stxv vs47, 48(T3)
|
|
|
|
addi CO,CO,64
|
|
|
|
|
|
.endm
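/* SAVE4x16 first transposes the lane-rotated accumulators back into four
   output vectors using xxmrglw/xxmrghw plus the save_permute_1/save_permute_2
   masks, then applies alpha.  In the plain GEMM build the existing C tile is
   loaded and updated with xvmaddasp (C += alpha*A*B); when TRMMKERNEL is
   defined C is not read and the result is written directly (C = alpha*A*B).
   The four 64-byte vectors go to CO, T1, T2, T3 and CO advances by 64. */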
|
|
|
|
|
|
|
|
/**********************************************************************************************
* Macros for N=4 and M=8
**********************************************************************************************/
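/* Same scheme as the N=4, M=16 block above, halved in M: vs0/vs1 hold the 8
   floats of A, B is again kept in vs24..vs27 (rotated copies), and only the
   accumulators vs32/vs33, vs36/vs37, vs40/vs41 and vs44/vs45 are used. */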
|
|
|
|
.macro LOAD4x8_1
|
|
LOAD4x8 1
|
|
.endm
|
|
|
|
.macro LOAD4x8_0
|
|
LOAD4x8 0
|
|
.endm
|
|
|
|
.macro KERNEL4x8_L1_L4 Index,IsLast
|
|
KERNEL4x8_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x8_I1_L4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x8_I1_L4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x8_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
.macro KERNEL4x8_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro KERNEL4x8_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x8_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x8_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro END4x8_NORMAL
|
|
END4x8 0, AO, BO, 32,16
|
|
.endm
|
|
|
|
.macro Zero4X8
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
|
|
.endm
|
|
|
|
.macro LOAD4x8 Zero
|
|
|
|
lxv vs24, 0(BO)
|
|
lxv vs0, 0(AO)
|
|
lxv vs1, 16(AO)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.if \Zero==1
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs36, vs36, vs36
|
|
xxlxor vs37, vs37, vs37
|
|
xxlxor vs40, vs40, vs40
|
|
xxlxor vs41, vs41, vs41
|
|
xxlxor vs44, vs44, vs44
|
|
xxlxor vs45, vs45, vs45
|
|
|
|
.endif
|
|
.endm
|
|
|
|
|
|
.macro END4x8 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro KERNEL4x8_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
|
|
|
|
lxv vs24, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,32+16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
|
|
|
|
lxv vs8, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
|
|
lxv vs4, DISP32(\Index,64+0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP32(\Index,64+16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
|
|
|
|
.if \Complete==0
|
|
lxv vs24, DISP16(\Index,48+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP32(\Index,96+\OffsetA)(\AREG)
|
|
lxv vs1, DISP32(\Index,96+16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
|
|
addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
|
|
addi \AREG, \AREG, DISP32(\Index,32*3+\OffsetA)
|
|
.else
|
|
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
.endif
|
|
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.endif
|
|
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
|
|
|
|
.endm
|
|
|
|
.macro KERNEL4x8 First
|
|
|
|
LOAD4x8 0
|
|
END4x8 \First, AO, BO, 32,16
|
|
.endm
|
|
|
|
.macro KERNEL4x8_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs5, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs10, vs8, permute_mask
|
|
xxpermdi vs9, vs8, vs8,2
|
|
.if \First==1
|
|
xvmulsp vs32, vs0,vs24
|
|
xvmulsp vs33, vs1,vs24
|
|
|
|
xvmulsp vs36, vs0,vs25
|
|
xvmulsp vs37, vs1,vs25
|
|
|
|
.else
|
|
xvmaddasp vs32, vs0,vs24
|
|
xvmaddasp vs33, vs1,vs24
|
|
|
|
xvmaddasp vs36, vs0,vs25
|
|
xvmaddasp vs37, vs1,vs25
|
|
|
|
.endif
|
|
|
|
xxpermdi vs11, vs10, vs10,2
|
|
|
|
.if \First==1
|
|
xvmulsp vs40, vs0,vs26
|
|
xvmulsp vs41, vs1,vs26
|
|
|
|
xvmulsp vs44, vs0,vs27
|
|
xvmulsp vs45, vs1,vs27
|
|
|
|
|
|
.else
|
|
xvmaddasp vs40, vs0,vs26
|
|
xvmaddasp vs41, vs1,vs26
|
|
|
|
xvmaddasp vs44, vs0,vs27
|
|
xvmaddasp vs45, vs1,vs27
|
|
|
|
|
|
.endif
|
|
.if \Complete==0
|
|
lxv vs24, DISP8(\Index,16+\OffsetB)(\BREG)
|
|
|
|
lxv vs0, DISP16(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs1, DISP16(\Index,32+16+\OffsetA)(\AREG)
|
|
|
|
xxperm vs26, vs24, permute_mask
|
|
xxpermdi vs25, vs24, vs24,2
|
|
.endif
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
|
|
addi \AREG, \AREG, DISP16(\Index,32+\OffsetA)
|
|
|
|
.else
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
.endif
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs4,vs8
|
|
xvmulsp vs33, vs5,vs8
|
|
|
|
xvmulsp vs36, vs4,vs9
|
|
xvmulsp vs37, vs5,vs9
|
|
|
|
.else
|
|
xvmaddasp vs32, vs4,vs8
|
|
xvmaddasp vs33, vs5,vs8
|
|
|
|
xvmaddasp vs36, vs4,vs9
|
|
xvmaddasp vs37, vs5,vs9
|
|
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
xxpermdi vs27, vs26, vs26,2
|
|
|
|
.endif
|
|
.if \First==1
|
|
xvmulsp vs40, vs4,vs10
|
|
xvmulsp vs41, vs5,vs10
|
|
|
|
xvmulsp vs44, vs4,vs11
|
|
xvmulsp vs45, vs5,vs11
|
|
|
|
.else
|
|
xvmaddasp vs40, vs4,vs10
|
|
xvmaddasp vs41, vs5,vs10
|
|
|
|
xvmaddasp vs44, vs4,vs11
|
|
xvmaddasp vs45, vs5,vs11
|
|
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE4x8
|
|
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
|
|
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs34, 0(CO)
|
|
lxv vs35, 16(CO)
|
|
lxv vs38, 0(T1)
|
|
lxv vs39, 16(T1)
|
|
lxv vs42, 0(T2)
|
|
lxv vs43, 16(T2)
|
|
lxv vs46, 0(T3)
|
|
lxv vs47, 16(T3)
|
|
|
|
|
|
#endif
|
|
|
|
xxmrglw vs8, vs32, vs44
|
|
xxmrglw vs10, vs36, vs40
|
|
|
|
xxmrghw vs1, vs32, vs44
|
|
xxmrghw vs0, vs36, vs40
|
|
|
|
xxmrglw vs12, vs33, vs45
|
|
xxmrglw vs14, vs37, vs41
|
|
|
|
xxmrghw vs2, vs37, vs41
|
|
xxmrghw vs3, vs33, vs45
|
|
|
|
xxlor vs9, vs8, vs8
|
|
xxlor vs11, vs10, vs10
|
|
|
|
xxlor vs13, vs12, vs12
|
|
xxlor vs15, vs14, vs14
|
|
|
|
xxperm vs8, vs0, save_permute_1
|
|
xxperm vs10, vs1, save_permute_1
|
|
xxperm vs9, vs0, save_permute_2
|
|
xxperm vs11, vs1, save_permute_2
|
|
|
|
xxperm vs12, vs2, save_permute_1
|
|
xxperm vs14, vs3, save_permute_1
|
|
|
|
xxperm vs13, vs2, save_permute_2
|
|
xxperm vs15, vs3, save_permute_2
|
|
|
|
|
|
/* multiply add normal way */
|
|
|
|
#ifdef TRMMKERNEL
|
|
xvmulsp vs34, vs8, alpha_r
|
|
xvmulsp vs35, vs12, alpha_r
|
|
xvmulsp vs38, vs9, alpha_r
|
|
xvmulsp vs39, vs13, alpha_r
|
|
xvmulsp vs42, vs10, alpha_r
|
|
xvmulsp vs43, vs14, alpha_r
|
|
xvmulsp vs46, vs11, alpha_r
|
|
xvmulsp vs47, vs15, alpha_r
|
|
#else
|
|
xvmaddasp vs34, vs8, alpha_r
|
|
xvmaddasp vs35, vs12, alpha_r
|
|
xvmaddasp vs38, vs9, alpha_r
|
|
xvmaddasp vs39, vs13, alpha_r
|
|
xvmaddasp vs42, vs10, alpha_r
|
|
xvmaddasp vs43, vs14, alpha_r
|
|
xvmaddasp vs46, vs11, alpha_r
|
|
xvmaddasp vs47, vs15, alpha_r
|
|
#endif
|
|
|
|
|
|
stxv vs34, 0(CO)
|
|
stxv vs35, 16(CO)
|
|
stxv vs38, 0(T1)
|
|
stxv vs39, 16(T1)
|
|
stxv vs42, 0(T2)
|
|
stxv vs43, 16(T2)
|
|
stxv vs46, 0(T3)
|
|
stxv vs47, 16(T3)
|
|
|
|
|
|
addi CO,CO,32
|
|
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
* Macros for N=4 and M=4
**********************************************************************************************/
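/* For N=4, M=4 the roles are swapped: the four A values are the ones that
   get lane-rotated (vs1..vs3 built from vs0 via xxperm/xxpermdi) while the 4
   floats of B stay in vs24, and only four accumulators (vs32..vs35) are
   needed.  SAVE4x4 then rebuilds the 4x4 tile with xxmrglw/xxmrghw and
   xxmrghd/xxmrgld before applying alpha. */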
|
|
|
|
.macro LOAD4x4_1
|
|
LOAD4x4 1
|
|
.endm
|
|
|
|
.macro LOAD4x4_0
|
|
LOAD4x4 0
|
|
.endm
|
|
|
|
.macro KERNEL4x4_L1_L4 Index,IsLast
|
|
KERNEL4x4_L1_L4_I AO,BO, 0,0, \Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x4_I1_L4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x4_I1_L4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x4_I1_L4_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L4_I AO,BO, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
.macro KERNEL4x4_I1_L2_3 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L2_I AO,BO,0, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro KERNEL4x4_I2_L4_2 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,0
|
|
.endm
|
|
|
|
.macro KERNEL4x4_I2_L4_3 AREG,BREG,OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x4_L1_L4_I \AREG,\BREG, \OffsetA,\OffsetB,\Index,\IsLast,1
|
|
.endm
|
|
|
|
.macro Zero4X4
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
|
|
.endm
|
|
|
|
.macro LOAD4x4 Zero
|
|
|
|
lxv vs0, 0(AO)
|
|
lxv vs24, 0(BO)
|
|
|
|
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
|
|
.if \Zero==1
|
|
xxlxor vs32, vs32, vs32
|
|
xxlxor vs33, vs33, vs33
|
|
xxlxor vs34, vs34, vs34
|
|
xxlxor vs35, vs35, vs35
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro END4x4_NORMAL
|
|
END4x4 0, AO, BO, 16,16
|
|
.endm
|
|
|
|
.macro END4x4 First, AREG, BREG, OffsetA, OffsetB
|
|
|
|
.if \OffsetB != 0
|
|
addi \BREG, \BREG, \OffsetB
|
|
.endif
|
|
.if \OffsetA != 0
|
|
addi \AREG, \AREG, \OffsetA
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs24, vs0
|
|
xvmulsp vs33, vs24, vs1
|
|
xvmulsp vs34, vs24, vs2
|
|
xvmulsp vs35, vs24, vs3
|
|
.else
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
|
|
.endif
|
|
.endm
|
|
|
|
.macro KERNEL4x4_L1_L4_I AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs4, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
|
|
lxv vs0, DISP16(\Index, 16+\OffsetA)(\AREG)
|
|
lxv vs24, DISP16(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
|
|
|
|
lxv vs4, DISP16(\Index, 32+\OffsetA)(\AREG)
|
|
lxv vs26, DISP16(\Index, 32+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
|
|
.if \Complete==0
|
|
|
|
lxv vs0, DISP16(\Index, 48+\OffsetA)(\AREG)
|
|
lxv vs24, DISP16(\Index, 48+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
.endif
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \AREG, \AREG, DISP16(\Index,16*3+\OffsetA)
|
|
addi \BREG, \BREG, DISP16(\Index,16*3+\OffsetB)
|
|
|
|
.else
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
|
|
.endif
|
|
.endif
|
|
|
|
|
|
.endm
|
|
|
|
.macro KERNEL4x4 First
|
|
LOAD4x4 0
|
|
END4x4 \First, AO, BO, 16,16
|
|
.endm
|
|
|
|
.macro KERNEL4x4_L1_L2_I AREG,BREG,First,OffsetA,OffsetB, Index,IsLast ,Complete
|
|
|
|
lxv vs4, DISP8(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
|
|
xxperm vs6, vs4, permute_mask
|
|
xxpermdi vs5, vs4, vs4,2
|
|
xxpermdi vs7, vs6, vs6,2
|
|
.if \First==1
|
|
xvmulsp vs32, vs24, vs0
|
|
xvmulsp vs33, vs24, vs1
|
|
xvmulsp vs34, vs24, vs2
|
|
xvmulsp vs35, vs24, vs3
|
|
|
|
.else
|
|
xvmaddasp vs32, vs24, vs0
|
|
xvmaddasp vs33, vs24, vs1
|
|
xvmaddasp vs34, vs24, vs2
|
|
xvmaddasp vs35, vs24, vs3
|
|
|
|
.endif
|
|
|
|
.if \Complete==0
|
|
|
|
lxv vs0, DISP8(\Index, 16+\OffsetA)(\AREG)
|
|
lxv vs24, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
xxperm vs2, vs0, permute_mask
|
|
xxpermdi vs1, vs0, vs0,2
|
|
xxpermdi vs3, vs2, vs2,2
|
|
.endif
|
|
|
|
.if \First==1
|
|
xvmulsp vs32, vs26, vs4
|
|
xvmulsp vs33, vs26, vs5
|
|
xvmulsp vs34, vs26, vs6
|
|
xvmulsp vs35, vs26, vs7
|
|
|
|
|
|
.else
|
|
xvmaddasp vs32, vs26, vs4
|
|
xvmaddasp vs33, vs26, vs5
|
|
xvmaddasp vs34, vs26, vs6
|
|
xvmaddasp vs35, vs26, vs7
|
|
|
|
.endif
|
|
|
|
|
|
.if \IsLast==1
|
|
.if \Complete==1
|
|
addi \AREG, \AREG, DISP8(\Index,16+\OffsetA)
|
|
addi \BREG, \BREG, DISP8(\Index,16+\OffsetB)
|
|
|
|
.else
|
|
addi \AREG, \AREG, DISP8(\Index,32)
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
|
|
.endif
|
|
.endif
|
|
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE4x4
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs36, 0(CO)
|
|
lxv vs37, 0(T1)
|
|
#endif
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
#if !defined(TRMMKERNEL)
|
|
lxv vs38, 0(T2)
|
|
lxv vs39, 0(T3)
|
|
#endif
|
|
|
|
xxmrglw vs0, vs35,vs32
|
|
xxmrglw vs1, vs34,vs33
|
|
xxmrglw vs4, vs32,vs35
|
|
xxmrglw vs5, vs33,vs34
|
|
|
|
|
|
xxmrghw vs2, vs35,vs32
|
|
xxmrghw vs3, vs34,vs33
|
|
xxmrghw vs6, vs32,vs35
|
|
xxmrghw vs7, vs33,vs34
|
|
|
|
xxmrgld vs24, vs1, vs0
|
|
xxmrghd vs25,vs5,vs4
|
|
|
|
xxmrgld vs26, vs2, vs3
|
|
xxmrghd vs27,vs6,vs7
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xvmulsp vs36, vs24, alpha_r
|
|
xvmulsp vs37, vs25, alpha_r
|
|
xvmulsp vs38, vs26, alpha_r
|
|
xvmulsp vs39, vs27, alpha_r
|
|
#else
|
|
xvmaddasp vs36, vs24, alpha_r
|
|
xvmaddasp vs37, vs25, alpha_r
|
|
xvmaddasp vs38, vs26, alpha_r
|
|
xvmaddasp vs39, vs27, alpha_r
|
|
#endif
|
|
stxv vs36, 0(CO)
|
|
stxv vs37, 0(T1)
|
|
stxv vs38, 0(T2)
|
|
stxv vs39, 0(T3)
|
|
|
|
|
|
|
|
addi CO,CO,16
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
* Macros for N=4 and M=2
**********************************************************************************************/
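/* With only two columns of A left, the kernel broadcasts the two A values
   with xxspltw and keeps the 4 floats of B as a vector, so vs0 and vs2 each
   accumulate a full 4-entry column of C.  SAVE4x2 then converts the lanes to
   scalar doubles (xscvspdp) and applies alpha with scalar FMAs before the
   stxssp stores. */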
|
|
|
|
|
|
.macro KERNEL4x2_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x2_I_2 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
|
|
|
|
.macro Zero4x2
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs2, vs2, vs2
|
|
|
|
.endm
|
|
|
|
.macro KERNEL4x2
|
|
KERNEL4x2_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL4x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 0
|
|
xxspltw vs9, vs36, 1
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs2, vs26, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs2, vs26, vs9
|
|
|
|
.endif
|
|
|
|
addi \AREG, \AREG, DISP2(\Index,8)
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
|
|
.endm
|
|
|
|
.macro KERNEL4x2_I_2 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs26, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs28, DISP8(\Index,16+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs4, 2
|
|
xxspltw vs9, vs4, 3
|
|
xxspltw vs10, vs4, 0
|
|
xxspltw vs11, vs4, 1
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs2, vs26, vs9
|
|
|
|
xvmaddasp vs0, vs28, vs10

xvmaddasp vs2, vs28, vs11
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs2, vs26, vs9
|
|
|
|
xvmaddasp vs0, vs28, vs10
|
|
xvmaddasp vs2, vs28, vs11
|
|
.endif
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE4x2
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
/*convert alpha_r for multiply*/
|
|
xscvspdp vs4,alpha_r
|
|
/* v0 corresponds to vs32, do not forget*/
|
|
#if !defined(TRMMKERNEL)
|
|
lxssp v0,0(CO)
|
|
lxssp v1,4(CO)
|
|
|
|
lxssp v2,0(T1)
|
|
lxssp v3,4(T1)
|
|
|
|
lxssp v4,0(T2)
|
|
lxssp v5,4(T2)
|
|
|
|
lxssp v6,0(T3)
|
|
lxssp v7,4(T3)
|
|
|
|
|
|
#endif
|
|
xscvspdp vs5, vs2
|
|
xxspltw vs6, vs2, 1
|
|
xxspltw vs7, vs2, 2
|
|
xxspltw vs8, vs2, 3
|
|
xscvspdp vs6,vs6
|
|
xscvspdp vs7,vs7
|
|
xscvspdp vs8,vs8
|
|
|
|
xscvspdp vs24, vs0
|
|
xxspltw vs25, vs0, 1
|
|
xxspltw vs26, vs0, 2
|
|
xxspltw vs27, vs0, 3
|
|
xscvspdp vs25,vs25
|
|
xscvspdp vs26,vs26
|
|
xscvspdp vs27,vs27
|
|
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xsmuldp vs32,vs8, vs4
|
|
xsmuldp vs33,vs27, vs4
|
|
|
|
xsmuldp vs34,vs7, vs4
|
|
xsmuldp vs35,vs26, vs4
|
|
|
|
xsmuldp vs36,vs6, vs4
|
|
xsmuldp vs37,vs25, vs4
|
|
|
|
xsmuldp vs38,vs5, vs4
|
|
xsmuldp vs39,vs24, vs4
|
|
|
|
|
|
#else
|
|
xsmaddadp vs32,vs8, vs4
|
|
xsmaddadp vs33,vs27, vs4
|
|
|
|
xsmaddadp vs34,vs7, vs4
|
|
xsmaddadp vs35,vs26, vs4
|
|
|
|
xsmaddadp vs36,vs6, vs4
|
|
xsmaddadp vs37,vs25, vs4
|
|
|
|
xsmaddadp vs38,vs5, vs4
|
|
xsmaddadp vs39,vs24, vs4
|
|
|
|
|
|
#endif
|
|
|
|
stxssp v0,0(CO)
|
|
stxssp v1,4(CO)
|
|
|
|
stxssp v2,0(T1)
|
|
stxssp v3,4(T1)
|
|
|
|
stxssp v4,0(T2)
|
|
stxssp v5,4(T2)
|
|
|
|
stxssp v6,0(T3)
|
|
stxssp v7,4(T3)
|
|
|
|
|
|
|
|
|
|
addi CO,CO,8
|
|
.endm
|
|
|
|
|
|
/**********************************************************************************************
* Macros for N=4 and M=1
**********************************************************************************************/
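/* Single-column case: one A value is broadcast to all lanes (lxvwsx, or
   xxspltw on a wider load in the unrolled variants) and multiplied by the
   4-float B vector, so vs0 accumulates the whole 4x1 result.  SAVE4x1
   extracts the lanes as scalar doubles and applies alpha the same way
   SAVE4x2 does. */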
|
|
.macro KERNEL4x1_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL4x1_I_4 AO,BO, 0, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro Zero4x1
|
|
xxlxor vs0, vs0, vs0
|
|
.endm
|
|
|
|
.macro KERNEL4x1
|
|
KERNEL4x1_1 AO,BO, 0
|
|
.endm
|
|
|
|
.macro KERNEL4x1_2
|
|
KERNEL4x1_2_1 AO,BO, 0
|
|
.endm
|
|
|
|
.macro KERNEL4x1_1 AREG,BREG,First
|
|
lxvwsx vs8, 0, \AREG
|
|
lxv vs26, 0(\BREG)
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
.endif
|
|
addi \AREG, \AREG, 4
|
|
addi \BREG, \BREG, 16
|
|
.endm
|
|
|
|
.macro KERNEL4x1_2_1 AREG,BREG,First
|
|
lxsd v4, 0(\AREG)
|
|
lxv vs26, 0(\BREG)
|
|
lxv vs28, 16(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmaddasp vs0, vs28, vs9
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs0, vs28, vs9
|
|
.endif
|
|
addi \AREG, \AREG, 8
|
|
addi \BREG, \BREG, 32
|
|
.endm
|
|
|
|
.macro KERNEL4x1_I_4 AREG,BREG,First,OffsetA,OffsetB, Index,IsLast
|
|
lxv vs4, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
xxspltw vs8, vs4, 3
|
|
xxspltw vs9, vs4, 2
|
|
xxspltw vs10, vs4, 1
|
|
xxspltw vs11, vs4, 0
|
|
lxv vs26, DISP16(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs28, DISP16(\Index,16+\OffsetB)(\BREG)
|
|
lxv vs30, DISP16(\Index,32+\OffsetB)(\BREG)
|
|
lxv vs32, DISP16(\Index,48+\OffsetB)(\BREG)
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmaddasp vs0, vs28, vs9

xvmaddasp vs0, vs30, vs10

xvmaddasp vs0, vs32, vs11
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs0, vs28, vs9
|
|
xvmaddasp vs0, vs30, vs10
|
|
xvmaddasp vs0, vs32, vs11
|
|
.endif
|
|
.if \IsLast==1
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
addi \BREG, \BREG, DISP16(\Index,64)
|
|
.endif
|
|
.endm
|
|
|
|
.macro SAVE4x1
|
|
slwi T10, LDC , 1
|
|
add T1, CO, LDC
|
|
add T2, CO, T10
|
|
add T3, T1, T10
|
|
/*convert alpha_r for multiply*/
|
|
xscvspdp vs4,alpha_r
|
|
/* v0 corresponds to vs32, do not forget*/
|
|
#if !defined(TRMMKERNEL)
|
|
lxssp v0,0(CO)
|
|
lxssp v2,0(T1)
|
|
lxssp v4,0(T2)
|
|
lxssp v6,0(T3)
|
|
#endif
|
|
xscvspdp vs24, vs0
|
|
xxspltw vs25, vs0, 1
|
|
xxspltw vs26, vs0, 2
|
|
xxspltw vs27, vs0, 3
|
|
xscvspdp vs25,vs25
|
|
xscvspdp vs26,vs26
|
|
xscvspdp vs27,vs27
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xsmuldp vs32,vs27, vs4
|
|
xsmuldp vs34,vs26, vs4
|
|
xsmuldp vs36,vs25, vs4
|
|
xsmuldp vs38,vs24, vs4
|
|
#else
|
|
xsmaddadp vs32,vs27, vs4
|
|
xsmaddadp vs34,vs26, vs4
|
|
xsmaddadp vs36,vs25, vs4
|
|
xsmaddadp vs38,vs24, vs4
|
|
#endif
|
|
stxssp v0,0(CO)
|
|
stxssp v2,0(T1)
|
|
stxssp v4,0(T2)
|
|
stxssp v6,0(T3)
|
|
addi CO,CO,4
|
|
.endm
|
|
|
|
/****************************N=2 section*****************/
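/* For N=2 the two B values are splatted to full vectors (xxspltw on the
   doubleword loaded with lxsd/lxv) and multiplied against wide slices of A,
   so each M width gets two vector accumulators per A slice.  The _4 and _2
   suffixed kernels simply unroll k by 4 or 2 and, in several cases, keep the
   partial sums in separate registers that are folded together in the SAVE
   macro. */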
|
|
|
|
.macro KERNEL2x16_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
|
|
.macro Zero2x16
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
xxlxor vs2, vs2, vs2
|
|
xxlxor vs3, vs3, vs3
|
|
xxlxor vs4, vs4, vs4
|
|
xxlxor vs5, vs5, vs5
|
|
xxlxor vs6, vs6, vs6
|
|
xxlxor vs7, vs7, vs7
|
|
.endm
|
|
|
|
.macro KERNEL2x16
|
|
KERNEL2x16_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL2x16_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
xvmulsp vs2, vs28, vs8
|
|
xvmulsp vs3, vs29, vs8
|
|
|
|
xvmulsp vs4, vs26, vs9
|
|
xvmulsp vs5, vs27, vs9
|
|
xvmulsp vs6, vs28, vs9
|
|
xvmulsp vs7, vs29, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
xvmaddasp vs6, vs28, vs9
|
|
xvmaddasp vs7, vs29, vs9
|
|
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL2x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
|
|
|
|
lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
|
|
lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
|
|
lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
|
|
lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
|
|
|
|
lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
|
|
lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
|
|
lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
|
|
lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
|
|
|
|
lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
|
|
lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
|
|
lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
|
|
lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
|
|
|
|
xxspltw vs8, vs38, 3
|
|
xxspltw vs9, vs38, 2
|
|
xxspltw vs10, vs38, 1
|
|
xxspltw vs11, vs38, 0
|
|
|
|
xxspltw vs12, vs39, 3
|
|
xxspltw vs13, vs39, 2
|
|
xxspltw vs14, vs39, 1
|
|
xxspltw vs15, vs39, 0
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
xvmaddasp vs6, vs28, vs9
|
|
xvmaddasp vs7, vs29, vs9
|
|
|
|
xvmaddasp vs0, vs16, vs10
|
|
xvmaddasp vs1, vs17, vs10
|
|
xvmaddasp vs2, vs18, vs10
|
|
xvmaddasp vs3, vs19, vs10
|
|
|
|
xvmaddasp vs4, vs16, vs11
|
|
xvmaddasp vs5, vs17, vs11
|
|
xvmaddasp vs6, vs18, vs11
|
|
xvmaddasp vs7, vs19, vs11
|
|
|
|
xvmaddasp vs0, vs30, vs12
|
|
xvmaddasp vs1, vs31, vs12
|
|
xvmaddasp vs2, vs32, vs12
|
|
xvmaddasp vs3, vs33, vs12
|
|
|
|
xvmaddasp vs4, vs30, vs13
|
|
xvmaddasp vs5, vs31, vs13
|
|
xvmaddasp vs6, vs32, vs13
|
|
xvmaddasp vs7, vs33, vs13
|
|
|
|
xvmaddasp vs0, vs34, vs14
|
|
xvmaddasp vs1, vs35, vs14
|
|
xvmaddasp vs2, vs36, vs14
|
|
xvmaddasp vs3, vs37, vs14
|
|
|
|
xvmaddasp vs4, vs34, vs15
|
|
xvmaddasp vs5, vs35, vs15
|
|
xvmaddasp vs6, vs36, vs15
|
|
xvmaddasp vs7, vs37, vs15
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP64(\Index,256)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 3
|
|
xxspltw vs9, vs36, 2
|
|
xxspltw vs10, vs36, 1
|
|
xxspltw vs11, vs36, 0
|
|
lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
|
|
lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
|
|
lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
|
|
lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
|
|
lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
xvmaddasp vs6, vs28, vs9
|
|
xvmaddasp vs7, vs29, vs9
|
|
|
|
xvmaddasp vs0, vs16, vs10
|
|
xvmaddasp vs1, vs17, vs10
|
|
xvmaddasp vs2, vs18, vs10
|
|
xvmaddasp vs3, vs19, vs10
|
|
|
|
xvmaddasp vs4, vs16, vs11
|
|
xvmaddasp vs5, vs17, vs11
|
|
xvmaddasp vs6, vs18, vs11
|
|
xvmaddasp vs7, vs19, vs11
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE2x16
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs16, 0(CO)
|
|
lxv vs17, 16(CO)
|
|
lxv vs18, 32(CO)
|
|
lxv vs19, 48(CO)
|
|
#endif
|
|
add T1, CO, LDC
|
|
#ifndef TRMMKERNEL
|
|
lxv vs26, 0(T1)
|
|
lxv vs27, 16(T1)
|
|
lxv vs28, 32(T1)
|
|
lxv vs29, 48(T1)
|
|
#endif
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xvmulsp vs16, vs0, alpha_r
|
|
xvmulsp vs17, vs1, alpha_r
|
|
xvmulsp vs18, vs2, alpha_r
|
|
xvmulsp vs19, vs3, alpha_r
|
|
xvmulsp vs26, vs4, alpha_r
|
|
xvmulsp vs27, vs5, alpha_r
|
|
xvmulsp vs28, vs6, alpha_r
|
|
xvmulsp vs29, vs7, alpha_r
|
|
#else
|
|
xvmaddasp vs16, vs0, alpha_r
|
|
xvmaddasp vs17, vs1, alpha_r
|
|
xvmaddasp vs18, vs2, alpha_r
|
|
xvmaddasp vs19, vs3, alpha_r
|
|
xvmaddasp vs26, vs4, alpha_r
|
|
xvmaddasp vs27, vs5, alpha_r
|
|
xvmaddasp vs28, vs6, alpha_r
|
|
xvmaddasp vs29, vs7, alpha_r
|
|
#endif
|
|
stxv vs16, 0(CO)
|
|
stxv vs17, 16(CO)
|
|
stxv vs18, 32(CO)
|
|
stxv vs19, 48(CO)
|
|
|
|
stxv vs26, 0(T1)
|
|
stxv vs27, 16(T1)
|
|
stxv vs28, 32(T1)
|
|
stxv vs29, 48(T1)
|
|
|
|
addi CO,CO,64
|
|
|
|
.endm
|
|
|
|
/* M=8 N=2 */
|
|
|
|
.macro KERNEL2x8_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
|
|
.macro Zero2x8
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
|
|
xxlxor vs4, vs4, vs4
|
|
xxlxor vs5, vs5, vs5
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x8
|
|
KERNEL2x8_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL2x8_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x8_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
|
|
xvmulsp vs4, vs26, vs9
|
|
xvmulsp vs5, vs27, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP8(\Index,32)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL2x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
|
|
lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
|
|
lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)
|
|
|
|
lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
|
|
lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
|
|
|
|
lxv vs34, DISP32(\Index, 96+ 0+\OffsetA)(\AREG)
|
|
lxv vs35, DISP32(\Index, 96+ 16+\OffsetA)(\AREG)
|
|
|
|
xxspltw vs8, vs38, 3
|
|
xxspltw vs9, vs38, 2
|
|
xxspltw vs10, vs38, 1
|
|
xxspltw vs11, vs38, 0
|
|
|
|
xxspltw vs12, vs39, 3
|
|
xxspltw vs13, vs39, 2
|
|
xxspltw vs14, vs39, 1
|
|
xxspltw vs15, vs39, 0
|
|
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
|
|
|
|
xvmaddasp vs0, vs16, vs10
|
|
xvmaddasp vs1, vs17, vs10
|
|
xvmaddasp vs4, vs16, vs11
|
|
xvmaddasp vs5, vs17, vs11
|
|
|
|
|
|
xvmaddasp vs0, vs30, vs12
|
|
xvmaddasp vs1, vs31, vs12
|
|
xvmaddasp vs4, vs30, vs13
|
|
xvmaddasp vs5, vs31, vs13
|
|
|
|
xvmaddasp vs0, vs34, vs14
|
|
xvmaddasp vs1, vs35, vs14
|
|
xvmaddasp vs4, vs34, vs15
|
|
xvmaddasp vs5, vs35, vs15
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 3
|
|
xxspltw vs9, vs36, 2
|
|
xxspltw vs10, vs36, 1
|
|
xxspltw vs11, vs36, 0
|
|
lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs16, DISP16(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs17, DISP16(\Index,48+\OffsetA)(\AREG)
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
|
|
xvmaddasp vs4, vs26, vs9
|
|
xvmaddasp vs5, vs27, vs9
|
|
|
|
xvmaddasp vs0, vs16, vs10
|
|
xvmaddasp vs1, vs17, vs10
|
|
|
|
xvmaddasp vs4, vs16, vs11
|
|
xvmaddasp vs5, vs17, vs11
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE2x8
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs16, 0(CO)
|
|
lxv vs17, 16(CO)
|
|
#endif
|
|
add T1, CO, LDC
|
|
#ifndef TRMMKERNEL
|
|
lxv vs26, 0(T1)
|
|
lxv vs27, 16(T1)
|
|
|
|
#endif
|
|
|
|
#if defined(TRMMKERNEL)
|
|
xvmulsp vs16, vs0, alpha_r
|
|
xvmulsp vs17, vs1, alpha_r
|
|
xvmulsp vs26, vs4, alpha_r
|
|
xvmulsp vs27, vs5, alpha_r
|
|
#else
|
|
xvmaddasp vs16, vs0, alpha_r
|
|
xvmaddasp vs17, vs1, alpha_r
|
|
xvmaddasp vs26, vs4, alpha_r
|
|
xvmaddasp vs27, vs5, alpha_r
|
|
#endif
|
|
|
|
stxv vs16, 0(CO)
|
|
stxv vs17, 16(CO)
|
|
|
|
|
|
stxv vs26, 0(T1)
|
|
stxv vs27, 16(T1)
|
|
|
|
addi CO,CO,32
|
|
|
|
.endm
|
|
|
|
|
|
/*M=4*/
|
|
|
|
|
|
.macro KERNEL2x4_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
/* we will aggregate on save: vs0+vs4, vs1+vs5 */
|
|
.macro Zero2x4
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
|
|
xxlxor vs4, vs4, vs4
|
|
xxlxor vs5, vs5, vs5
|
|
|
|
.endm
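/* Keeping the even and odd k-iterations in separate accumulator pairs
   (vs0/vs1 and vs4/vs5) breaks the dependency chain between consecutive
   xvmaddasp instructions; the two halves are only summed once, in SAVE2x4. */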
|
|
|
|
.macro KERNEL2x4
|
|
KERNEL2x4_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL2x4_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x4_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs26, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs26, vs9
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL2x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs38, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs39, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs16, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
|
|
lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
|
|
lxv vs34, DISP16(\Index,32+ 16+\OffsetA)(\AREG)
|
|
|
|
|
|
xxspltw vs8, vs38, 3
|
|
xxspltw vs9, vs38, 2
|
|
xxspltw vs10, vs38, 1
|
|
xxspltw vs11, vs38, 0
|
|
|
|
xxspltw vs12, vs39, 3
|
|
xxspltw vs13, vs39, 2
|
|
xxspltw vs14, vs39, 1
|
|
xxspltw vs15, vs39, 0
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs26, vs9
|
|
xvmaddasp vs4, vs16, vs10
|
|
xvmaddasp vs5, vs16, vs11
|
|
|
|
|
|
xvmaddasp vs0, vs30, vs12
|
|
xvmaddasp vs1, vs30, vs13
|
|
xvmaddasp vs4, vs34, vs14
|
|
xvmaddasp vs5, vs34, vs15
|
|
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs36, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 3
|
|
xxspltw vs9, vs36, 2
|
|
xxspltw vs10, vs36, 1
|
|
xxspltw vs11, vs36, 0
|
|
lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs16, DISP8(\Index, 16+\OffsetA)(\AREG)
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs26, vs9
|
|
xvmaddasp vs4, vs16, vs10
|
|
xvmaddasp vs5, vs16, vs11
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP8(\Index,32)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE2x4
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxv vs16, 0(CO)
|
|
#endif
|
|
add T1, CO, LDC
|
|
#ifndef TRMMKERNEL
|
|
lxv vs26, 0(T1)
|
|
|
|
#endif
|
|
/*aggregate vectors*/
|
|
xvaddsp vs0,vs0,vs4
|
|
xvaddsp vs1,vs1,vs5
|
|
#if defined(TRMMKERNEL)
|
|
xvmulsp vs16, vs0, alpha_r
|
|
xvmulsp vs26, vs1, alpha_r
|
|
#else
|
|
xvmaddasp vs16, vs0, alpha_r
|
|
xvmaddasp vs26, vs1, alpha_r
|
|
#endif
|
|
|
|
stxv vs16, 0(CO)
|
|
stxv vs26, 0(T1)
|
|
|
|
addi CO,CO,16
|
|
|
|
.endm
|
|
|
|
|
|
/* M=2 N=2: we will use an inner permute. Before, permute_mask was reversing 3,2,1,0; now it will inner-reverse to 1,0,3,2 */
|
|
.macro SWITCH_PERMUTE_INNER
|
|
xxpermdi permute_mask, permute_mask, permute_mask,2
|
|
.endm
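/* For the 2x2 tile both operands are 2-float doublewords.  vs0 accumulates
   the straight products and vs1 the products against the inner-permuted copy
   of B, which together cover all four C entries ({00,11} and {01,10}).
   SAVE2x2 folds the upper/lower halves with xxpermdi + xvaddsp and then uses
   the permute again to put the entries back into per-column order before the
   two stxsd stores. */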
|
|
|
|
.macro Zero2x2
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
SWITCH_PERMUTE_INNER
|
|
.endm
|
|
|
|
.macro KERNEL2x2
|
|
KERNEL2x2_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL2x2_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x2_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x2_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
xxperm vs9, vs36, permute_mask
|
|
lxsd v5, DISP2(\Index, 0+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs37, vs36
|
|
xvmulsp vs1, vs37, vs9
|
|
|
|
.else
|
|
xvmaddasp vs0, vs37, vs36
|
|
xvmaddasp vs1, vs37, vs9
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP2(\Index,8)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL2x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs16, DISP8(\Index,16+\OffsetA)(\AREG)
|
|
|
|
|
|
xxperm vs9, vs8, permute_mask
|
|
xxperm vs11, vs10, permute_mask
|
|
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs26, vs9
|
|
xvmaddasp vs0, vs16, vs10
|
|
xvmaddasp vs1, vs16, vs11
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP8(\Index,32)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs8, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
|
|
|
|
xxperm vs9, vs8, permute_mask
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs26, vs9
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
.endif
|
|
.endm
|
|
|
|
|
|
.macro SAVE2x2
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxsd v4 , 0(CO)
|
|
#endif
|
|
add T1, CO, LDC
|
|
#ifndef TRMMKERNEL
|
|
lxsd v5 , 0(T1)
|
|
|
|
#endif
|
|
/*aggregate vectors*/
|
|
xxpermdi vs4,vs0,vs0,2
|
|
xxpermdi vs5,vs1,vs1,2
|
|
xvaddsp vs0,vs0,vs4
|
|
xvaddsp vs1,vs1,vs5
|
|
/* */
|
|
/* let's correct the order to {00,10} and {01,11} from {00,11} {01,10} */
|
|
xxperm vs1,vs1, permute_mask
|
|
|
|
|
|
xxmrghw vs2 ,vs1,vs0
|
|
xxpermdi vs2,vs2,vs2,2
|
|
xxmrghw vs3 ,vs0,vs1
|
|
#if defined(TRMMKERNEL)
|
|
xvmulsp vs36, vs2, alpha_r
|
|
xvmulsp vs37, vs3, alpha_r
|
|
#else
|
|
xvmaddasp vs36, vs2, alpha_r
|
|
xvmaddasp vs37, vs3, alpha_r
|
|
#endif
|
|
/**** store last two words*/
|
|
|
|
|
|
stxsd v4, 0(CO)
|
|
stxsd v5, 0(T1)
|
|
|
|
addi CO,CO,8
|
|
|
|
.endm
|
|
|
|
/*--------------------------- M=1 N=2 */
|
|
.macro Zero2x1
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
xxlxor vs2,vs2,vs2
|
|
xxlxor vs3,vs3,vs3
|
|
.endm
|
|
|
|
.macro KERNEL2x1
|
|
KERNEL2x1_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL2x1_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL2x1_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL2x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
/*
|
|
we compute the leftover iterations with scalar ops (vs2, vs3) and add them to the batched vector results (vs0, vs1) in SAVE2x1
|
|
*/
|
|
.macro KERNEL2x1_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxssp v3, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
lxssp v4, DISP2(\Index, 4+\OffsetB)(\BREG)
|
|
lxssp v5, DISP1(\Index, 0+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xsmuldp vs2, vs37, vs35

xsmuldp vs3, vs37, vs36
|
|
|
|
.else
|
|
xsmaddadp vs2, vs37, vs35
|
|
xsmaddadp vs3, vs37, vs36
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP1(\Index,4)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL2x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs8, DISP8(\Index, 0+\OffsetB)(\BREG)
|
|
lxv vs10, DISP8(\Index, 16+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)
|
|
|
|
xxmrglw vs5, vs26,vs26
|
|
xxmrghw vs6, vs26,vs26
|
|
|
|
xvmaddasp vs0, vs8, vs5
|
|
xvmaddasp vs1, vs10, vs6
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP8(\Index,32)
|
|
addi \AREG, \AREG, DISP4(\Index,16)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL2x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxssp v3, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
lxssp v4, DISP4(\Index, 4+\OffsetB)(\BREG)
|
|
lxssp v7, DISP4(\Index, 8+\OffsetB)(\BREG)
|
|
lxssp v8, DISP4(\Index, 12+\OffsetB)(\BREG)
|
|
lxssp v5, DISP2(\Index, 0+\OffsetA)(\AREG)
|
|
lxssp v6, DISP2(\Index, 4+\OffsetA)(\AREG)
|
|
|
|
|
|
xsmaddadp vs2, vs37, vs35
|
|
xsmaddadp vs3, vs37, vs36
|
|
|
|
xsmaddadp vs2, vs38, vs39
|
|
xsmaddadp vs3, vs38, vs40
|
|
|
|
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP2(\Index,8)
|
|
.endm
|
|
|
|
|
|
.macro SAVE2x1
|
|
|
|
#ifndef TRMMKERNEL
|
|
lxssp v4 , 0(CO)
|
|
#endif
|
|
add T1, CO, LDC
|
|
#ifndef TRMMKERNEL
|
|
lxssp v5 , 0(T1)
|
|
|
|
#endif
|
|
|
|
/*convert alpha_r for multiply*/
|
|
xscvspdp vs16,alpha_r
|
|
|
|
/* aggregate the vector accumulators from KERNEL2x1_4 */
|
|
xxpermdi vs4,vs0,vs0,2
|
|
xxpermdi vs5,vs1,vs1,2
|
|
xvaddsp vs0,vs0,vs4
|
|
xvaddsp vs1,vs1,vs5
|
|
xvaddsp vs0,vs0,vs1
|
|
/* combine the scalar results of KERNEL2x1_2 / KERNEL2x1_1 with the KERNEL2x1_4 vector results */
|
|
xscvspdp vs5, vs0
|
|
xxspltw vs6, vs0, 1
|
|
xscvspdp vs6,vs6
|
|
xsadddp vs2,vs2,vs6
|
|
xsadddp vs3,vs3,vs5
|
|
|
|
/**** store last two words*/
|
|
#if defined(TRMMKERNEL)
|
|
xsmuldp vs36,vs2, vs16
|
|
xsmuldp vs37,vs3, vs16
|
|
|
|
#else
|
|
xsmaddadp vs36,vs2, vs16
|
|
xsmaddadp vs37,vs3, vs16
|
|
#endif
|
|
|
|
stxssp v4, 0(CO)
|
|
stxssp v5, 0(T1)
|
|
|
|
addi CO,CO,4
|
|
|
|
.endm
|
|
|
|
|
|
|
|
/****************************N=1 section*****************/
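/* For N=1 a single B value is broadcast to every lane (xscvdpspn + xxspltw,
   or xxspltw on a wider load in the unrolled variants) and the kernels reduce
   to AXPY-style vector FMAs over a row of A.  For the narrower M widths the
   partial sums are again split across extra registers and summed in the SAVE
   macros (e.g. SAVE1x8 adds vs0+vs2 and vs1+vs3). */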
|
|
|
|
.macro KERNEL1x16_2 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL1x16_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
|
|
.macro Zero1x16
|
|
xxlxor vs0, vs0, vs0
|
|
xxlxor vs1, vs1, vs1
|
|
xxlxor vs2, vs2, vs2
|
|
xxlxor vs3, vs3, vs3
|
|
.endm
|
|
|
|
.macro KERNEL1x16
|
|
KERNEL1x16_1 AO,BO, 0, 0,0,0
|
|
.endm
|
|
.macro KERNEL1x16_4 OffsetA,OffsetB, Index,IsLast
|
|
KERNEL1x16_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
|
|
.endm
|
|
|
|
.macro KERNEL1x16_1 AREG,BREG,First,OffsetA,OffsetB,Index
|
|
|
|
|
|
lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
|
|
xscvdpspn vs36,vs36
|
|
xxspltw vs8, vs36, 0
|
|
lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP16(\Index, 32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP16(\Index,48+\OffsetA)(\AREG)
|
|
|
|
|
|
.if \First==1
|
|
xvmulsp vs0, vs26, vs8
|
|
xvmulsp vs1, vs27, vs8
|
|
xvmulsp vs2, vs28, vs8
|
|
xvmulsp vs3, vs29, vs8
|
|
|
|
|
|
.else
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
.endif
|
|
|
|
addi \BREG, \BREG, DISP1(\Index,4)
|
|
addi \AREG, \AREG, DISP16(\Index,64)
|
|
|
|
.endm
|
|
|
|
|
|
|
|
|
|
.macro KERNEL1x16_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)
|
|
|
|
lxv vs26, DISP64(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP64(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP64(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP64(\Index,48+\OffsetA)(\AREG)
|
|
|
|
lxv vs16, DISP64(\Index,64+ 0+\OffsetA)(\AREG)
|
|
lxv vs17, DISP64(\Index,64+ 16+\OffsetA)(\AREG)
|
|
lxv vs18, DISP64(\Index,64+ 32+\OffsetA)(\AREG)
|
|
lxv vs19, DISP64(\Index,64+ 48+\OffsetA)(\AREG)
|
|
|
|
xxspltw vs8, vs38, 3
|
|
xxspltw vs9, vs38, 2
|
|
|
|
lxv vs30, DISP64(\Index,128+ 0+\OffsetA)(\AREG)
|
|
lxv vs31, DISP64(\Index,128+ 16+\OffsetA)(\AREG)
|
|
lxv vs32, DISP64(\Index,128+ 32+\OffsetA)(\AREG)
|
|
lxv vs33, DISP64(\Index,128+ 48+\OffsetA)(\AREG)
|
|
|
|
lxv vs34, DISP64(\Index,128+ 64+ 0+\OffsetA)(\AREG)
|
|
lxv vs35, DISP64(\Index,128+ 64+ 16+\OffsetA)(\AREG)
|
|
lxv vs36, DISP64(\Index,128+ 64+ 32+\OffsetA)(\AREG)
|
|
lxv vs37, DISP64(\Index,128+ 64+ 48+\OffsetA)(\AREG)
|
|
|
|
xxspltw vs10, vs38, 1
|
|
xxspltw vs11, vs38, 0
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
|
|
xvmaddasp vs0, vs16, vs9
|
|
xvmaddasp vs1, vs17, vs9
|
|
xvmaddasp vs2, vs18, vs9
|
|
xvmaddasp vs3, vs19, vs9
|
|
|
|
|
|
xvmaddasp vs0, vs30, vs10
|
|
xvmaddasp vs1, vs31, vs10
|
|
xvmaddasp vs2, vs32, vs10
|
|
xvmaddasp vs3, vs33, vs10
|
|
|
|
|
|
xvmaddasp vs0, vs34, vs11
|
|
xvmaddasp vs1, vs35, vs11
|
|
xvmaddasp vs2, vs36, vs11
|
|
xvmaddasp vs3, vs37, vs11
|
|
|
|
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP4(\Index,16)
|
|
addi \AREG, \AREG, DISP64(\Index,256)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
.macro KERNEL1x16_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast
|
|
|
|
lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
|
|
xxspltw vs8, vs36, 1
|
|
xxspltw vs9, vs36, 0
|
|
lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
|
|
lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)
|
|
lxv vs28, DISP32(\Index,32+\OffsetA)(\AREG)
|
|
lxv vs29, DISP32(\Index,48+\OffsetA)(\AREG)
|
|
lxv vs16, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
|
|
lxv vs17, DISP32(\Index,64+ 16+\OffsetA)(\AREG)
|
|
lxv vs18, DISP32(\Index,64+ 32+\OffsetA)(\AREG)
|
|
lxv vs19, DISP32(\Index,64+ 48+\OffsetA)(\AREG)
|
|
|
|
|
|
xvmaddasp vs0, vs26, vs8
|
|
xvmaddasp vs1, vs27, vs8
|
|
xvmaddasp vs2, vs28, vs8
|
|
xvmaddasp vs3, vs29, vs8
|
|
|
|
|
|
xvmaddasp vs0, vs16, vs9
|
|
xvmaddasp vs1, vs17, vs9
|
|
xvmaddasp vs2, vs18, vs9
|
|
xvmaddasp vs3, vs19, vs9
|
|
|
|
|
|
.if \IsLast==1
|
|
addi \BREG, \BREG, DISP2(\Index,8)
|
|
addi \AREG, \AREG, DISP32(\Index,128)
|
|
.endif
|
|
|
|
.endm
|
|
|
|
|
|
.macro SAVE1x16

#ifndef TRMMKERNEL
    lxv vs16, 0(CO)
    lxv vs17, 16(CO)
    lxv vs18, 32(CO)
    lxv vs19, 48(CO)
#endif

#if defined(TRMMKERNEL)
    xvmulsp vs16, vs0, alpha_r
    xvmulsp vs17, vs1, alpha_r
    xvmulsp vs18, vs2, alpha_r
    xvmulsp vs19, vs3, alpha_r
#else
    xvmaddasp vs16, vs0, alpha_r
    xvmaddasp vs17, vs1, alpha_r
    xvmaddasp vs18, vs2, alpha_r
    xvmaddasp vs19, vs3, alpha_r
#endif

    stxv vs16, 0(CO)
    stxv vs17, 16(CO)
    stxv vs18, 32(CO)
    stxv vs19, 48(CO)

    addi CO,CO,64

.endm

/* M=8 N=1 */

.macro KERNEL1x8_2 OffsetA,OffsetB, Index,IsLast
    KERNEL1x8_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero1x8
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
.endm

.macro KERNEL1x8
    KERNEL1x8_1 AO,BO, 0, 0,0,0
.endm

.macro KERNEL1x8_4 OffsetA,OffsetB, Index,IsLast
    KERNEL1x8_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

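/* N=1, M=8, single k step: lxssp loads one B scalar, xscvdpspn/xxspltw
   broadcast it to all lanes, then two 4-float A vectors are multiplied
   (First==1) or accumulated into vs0/vs1 */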
.macro KERNEL1x8_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
    xscvdpspn vs36,vs36
    xxspltw vs8, vs36, 0
    lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)

.if \First==1
    xvmulsp vs0, vs26, vs8
    xvmulsp vs1, vs27, vs8
.else
    xvmaddasp vs0, vs26, vs8
    xvmaddasp vs1, vs27, vs8
.endif

    addi \BREG, \BREG, DISP1(\Index,4)
    addi \AREG, \AREG, DISP8(\Index,32)

.endm

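/* N=1, M=8 kernel unrolled by 4: four B scalars from vs38 are splatted and
   multiplied against 4x8 packed A floats; even steps accumulate into vs0/vs1,
   odd steps into vs2/vs3 so SAVE1x8 can fold them afterwards */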
.macro KERNEL1x8_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)

    lxv vs26, DISP32(\Index, 0+\OffsetA)(\AREG)
    lxv vs27, DISP32(\Index,16+\OffsetA)(\AREG)

    lxv vs16, DISP32(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs17, DISP32(\Index,32+ 16+\OffsetA)(\AREG)

    xxspltw vs8, vs38, 3
    xxspltw vs9, vs38, 2

    lxv vs30, DISP32(\Index,64+ 0+\OffsetA)(\AREG)
    lxv vs31, DISP32(\Index,64+ 16+\OffsetA)(\AREG)

    lxv vs34, DISP32(\Index,64+ 32+ 0+\OffsetA)(\AREG)
    lxv vs35, DISP32(\Index,64+ 32+ 16+\OffsetA)(\AREG)

    xxspltw vs10, vs38, 1
    xxspltw vs11, vs38, 0

    xvmaddasp vs0, vs26, vs8
    xvmaddasp vs1, vs27, vs8

    xvmaddasp vs2, vs16, vs9
    xvmaddasp vs3, vs17, vs9

    xvmaddasp vs0, vs30, vs10
    xvmaddasp vs1, vs31, vs10

    xvmaddasp vs2, vs34, vs11
    xvmaddasp vs3, vs35, vs11

.if \IsLast==1
    addi \BREG, \BREG, DISP4(\Index,16)
    addi \AREG, \AREG, DISP32(\Index,128)
.endif

.endm

.macro KERNEL1x8_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw vs8, vs36, 1
    xxspltw vs9, vs36, 0
    lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)
    lxv vs16, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs17, DISP16(\Index,32+ 16+\OffsetA)(\AREG)

    xvmaddasp vs0, vs26, vs8
    xvmaddasp vs1, vs27, vs8

    xvmaddasp vs2, vs16, vs9
    xvmaddasp vs3, vs17, vs9

.if \IsLast==1
    addi \BREG, \BREG, DISP2(\Index,8)
    addi \AREG, \AREG, DISP16(\Index,64)
.endif

.endm

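/* write one 8-wide row of C: fold the split accumulators, apply alpha_r
   (plus the old C values unless TRMMKERNEL), store 32 bytes, bump CO */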
.macro SAVE1x8

#ifndef TRMMKERNEL
    lxv vs16, 0(CO)
    lxv vs17, 16(CO)
#endif

    /* aggregate vs0 with vs2 and vs1 with vs3 */
    xvaddsp vs0,vs0,vs2
    xvaddsp vs1,vs1,vs3

#if defined(TRMMKERNEL)
    xvmulsp vs16, vs0, alpha_r
    xvmulsp vs17, vs1, alpha_r
#else
    xvmaddasp vs16, vs0, alpha_r
    xvmaddasp vs17, vs1, alpha_r
#endif

    stxv vs16, 0(CO)
    stxv vs17, 16(CO)

    addi CO,CO,32

.endm

/* M=4 N=1 */

.macro KERNEL1x4_2 OffsetA,OffsetB, Index,IsLast
    KERNEL1x4_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro Zero1x4
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
.endm

.macro KERNEL1x4
    KERNEL1x4_1 AO,BO, 0, 0,0,0
.endm

.macro KERNEL1x4_4 OffsetA,OffsetB, Index,IsLast
    KERNEL1x4_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x4_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v4, DISP1(\Index, 0+\OffsetB)(\BREG)
    xscvdpspn vs36,vs36
    xxspltw vs8, vs36, 0
    lxv vs26, DISP4(\Index, 0+\OffsetA)(\AREG)

.if \First==1
    xvmulsp vs0, vs26, vs8
.else
    xvmaddasp vs0, vs26, vs8
.endif

    addi \BREG, \BREG, DISP1(\Index,4)
    addi \AREG, \AREG, DISP4(\Index,16)

.endm

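/* N=1, M=4 kernel unrolled by 4: each of the four splatted B scalars is
   multiplied against one 4-float A vector, spreading the partial sums
   over vs0..vs3 */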
.macro KERNEL1x4_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs38, DISP4(\Index, 0+\OffsetB)(\BREG)

    lxv vs26, DISP16(\Index, 0+\OffsetA)(\AREG)
    lxv vs27, DISP16(\Index,16+\OffsetA)(\AREG)

    xxspltw vs8, vs38, 3
    xxspltw vs9, vs38, 2

    lxv vs30, DISP16(\Index,32+ 0+\OffsetA)(\AREG)
    lxv vs31, DISP16(\Index,32+ 16+\OffsetA)(\AREG)

    xxspltw vs10, vs38, 1
    xxspltw vs11, vs38, 0

    xvmaddasp vs0, vs26, vs8
    xvmaddasp vs1, vs27, vs9
    xvmaddasp vs2, vs30, vs10
    xvmaddasp vs3, vs31, vs11

.if \IsLast==1
    addi \BREG, \BREG, DISP4(\Index,16)
    addi \AREG, \AREG, DISP16(\Index,64)
.endif

.endm

.macro KERNEL1x4_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4, DISP2(\Index, 0+\OffsetB)(\BREG)
    xxspltw vs8, vs36, 1
    xxspltw vs9, vs36, 0
    lxv vs26, DISP8(\Index, 0+\OffsetA)(\AREG)
    lxv vs27, DISP8(\Index,16+\OffsetA)(\AREG)

    xvmaddasp vs0, vs26, vs8
    xvmaddasp vs1, vs27, vs9

.if \IsLast==1
    addi \BREG, \BREG, DISP2(\Index,8)
    addi \AREG, \AREG, DISP8(\Index,32)
.endif

.endm

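/* write one 4-wide row of C: reduce vs0..vs3 to a single vector, apply
   alpha_r (accumulating into C unless TRMMKERNEL), store 16 bytes */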
.macro SAVE1x4

#ifndef TRMMKERNEL
    lxv vs16, 0(CO)
#endif

    /* aggregate */
    xvaddsp vs0,vs0,vs2
    xvaddsp vs1,vs1,vs3
    xvaddsp vs0,vs1,vs0

#if defined(TRMMKERNEL)
    xvmulsp vs16, vs0, alpha_r
#else
    xvmaddasp vs16, vs0, alpha_r
#endif

    stxv vs16, 0(CO)

    addi CO,CO,16

.endm

/* M=2 N=1 */

.macro Zero1x2
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
.endm

.macro KERNEL1x2
    KERNEL1x2_1 AO,BO, 0, 0,0,0
.endm

.macro KERNEL1x2_4 OffsetA,OffsetB, Index,IsLast
    KERNEL1x2_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x2_2 OffsetA,OffsetB, Index,IsLast
    KERNEL1x2_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

/*
   calculate one k step alone in double precision; SAVE1x2 later adds it
   to the batched (vectorized) results
*/
.macro KERNEL1x2_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v3, DISP2(\Index, 0+\OffsetB)(\AREG)
    lxssp v4, DISP2(\Index, 4+\OffsetB)(\AREG)
    lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)

.if \First==1
    xvmuldp vs2, vs37, vs35
    xvmuldp vs3, vs37, vs36
.else
    xsmaddadp vs2, vs37, vs35
    xsmaddadp vs3, vs37, vs36
.endif

    addi \AREG, \AREG, DISP2(\Index,8)
    addi \BREG, \BREG, DISP1(\Index,4)

.endm

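/* N=1, M=2 kernel unrolled by 4: 8 packed A floats (2 per k step) and 4 B
   floats are loaded at once; xxmrglw/xxmrghw duplicate each B value pairwise
   so the vector FMAs line up with the A pairs, accumulating into vs0/vs1 */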
.macro KERNEL1x2_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
    lxv vs10, DISP8(\Index, 16+\OffsetB)(\AREG)

    lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)

    xxmrglw vs5, vs26,vs26
    xxmrghw vs6, vs26,vs26

    xvmaddasp vs0, vs8, vs5
    xvmaddasp vs1, vs10, vs6

.if \IsLast==1
    addi \AREG, \AREG, DISP8(\Index,32)
    addi \BREG, \BREG, DISP4(\Index,16)
.endif

.endm

.macro KERNEL1x2_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxssp v3, DISP4(\Index, 0+\OffsetB)(\AREG)
    lxssp v4, DISP4(\Index, 4+\OffsetB)(\AREG)
    lxssp v7, DISP4(\Index, 8+\OffsetB)(\AREG)
    lxssp v8, DISP4(\Index, 12+\OffsetB)(\AREG)
    lxssp v5, DISP2(\Index, 0+\OffsetA)(\BREG)
    lxssp v6, DISP2(\Index, 4+\OffsetA)(\BREG)

    xsmaddadp vs2, vs37, vs35
    xsmaddadp vs3, vs37, vs36

    xsmaddadp vs2, vs38, vs39
    xsmaddadp vs3, vs38, vs40

    addi \AREG, \AREG, DISP4(\Index,16)
    addi \BREG, \BREG, DISP2(\Index,8)
.endm

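/* write a 2-wide row of C: reduce the vector accumulators, fold them into
   the double-precision scalars vs2/vs3, apply alpha and store two words */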
.macro SAVE1x2

#ifndef TRMMKERNEL
    lxssp v4, 0(CO)
    lxssp v5, 4(CO)
#endif

    /* convert alpha_r to double precision for the scalar multiply */
    xscvspdp vs16,alpha_r

    /* aggregate the vector accumulators from KERNEL1x2_I_4 */
    xxpermdi vs4,vs0,vs0,2
    xxpermdi vs5,vs1,vs1,2
    xvaddsp vs0,vs0,vs4
    xvaddsp vs1,vs1,vs5
    xvaddsp vs0,vs0,vs1
    /* fold the reduced vector lanes into the scalar accumulators vs2/vs3 */
    xscvspdp vs5, vs0
    xxspltw vs6, vs0, 1
    xscvspdp vs6,vs6
    xsadddp vs2,vs2,vs6
    xsadddp vs3,vs3,vs5

    /* apply alpha and store the two words */
#if defined(TRMMKERNEL)
    xsmuldp vs36,vs2, vs16
    xsmuldp vs37,vs3, vs16
#else
    xsmaddadp vs36,vs2, vs16
    xsmaddadp vs37,vs3, vs16
#endif

    stxssp v4, 0(CO)
    stxssp v5, 4(CO)

    addi CO,CO,8

.endm

/*///////////////// N=1 M=1 //////////////////*/

.macro Zero1x1
    xxlxor vs0, vs0, vs0
    xxlxor vs1, vs1, vs1
    xxlxor vs2, vs2, vs2
    xxlxor vs3, vs3, vs3
    xxlxor vs4, vs4, vs4
.endm

.macro KERNEL1x1
    KERNEL1x1_1 AO,BO, 1, 0,0,0
.endm

.macro KERNEL1x1_16 OffsetA,OffsetB, Index,IsLast
    KERNEL1x1_I_16 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_8 OffsetA,OffsetB, Index,IsLast
    KERNEL1x1_I_8 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_4 OffsetA,OffsetB, Index,IsLast
    KERNEL1x1_I_4 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

.macro KERNEL1x1_2 OffsetA,OffsetB, Index,IsLast
    KERNEL1x1_I_2 AO,BO, \OffsetA,\OffsetB,\Index,\IsLast
.endm

/*
   calculate one k step alone; First==1 initializes vs4 with a multiply
   instead of accumulating
*/
.macro KERNEL1x1_1 AREG,BREG,First,OffsetA,OffsetB,Index

    lxssp v3, DISP1(\Index, 0+\OffsetB)(\AREG)
    lxssp v5, DISP1(\Index, 0+\OffsetA)(\BREG)

.if \First==1
    xvmuldp vs4, vs37, vs35
.else
    xsmaddadp vs4, vs37, vs35
.endif

    addi \AREG, \AREG, DISP1(\Index,4)
    addi \BREG, \BREG, DISP1(\Index,4)

.endm

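/* N=1, M=1 inner loops: the dot product over k is vectorized, with the
   _16/_8/_4/_2 variants consuming that many k values per call and keeping
   partial sums in vs0..vs3 until SAVE1x1 reduces every lane */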
.macro KERNEL1x1_I_16 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8, DISP16(\Index, 0+\OffsetB)(\AREG)
    lxv vs9, DISP16(\Index, 16+\OffsetB)(\AREG)
    lxv vs10, DISP16(\Index, 32+0+\OffsetB)(\AREG)
    lxv vs11, DISP16(\Index, 32+ 16+\OffsetB)(\AREG)
    lxv vs26, DISP16(\Index, 0+\OffsetA)(\BREG)
    lxv vs16, DISP16(\Index, 16+\OffsetA)(\BREG)
    lxv vs17, DISP16(\Index, 32+0+\OffsetA)(\BREG)
    lxv vs18, DISP16(\Index, 32+16+\OffsetA)(\BREG)
    xvmaddasp vs0, vs8, vs26
    xvmaddasp vs1, vs9, vs16
    xvmaddasp vs2, vs10, vs17
    xvmaddasp vs3, vs11, vs18
.if \IsLast==1
    addi \AREG, \AREG, DISP16(\Index,64)
    addi \BREG, \BREG, DISP16(\Index,64)
.endif

.endm

.macro KERNEL1x1_I_8 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8, DISP8(\Index, 0+\OffsetB)(\AREG)
    lxv vs9, DISP8(\Index, 16+\OffsetB)(\AREG)
    lxv vs26, DISP8(\Index, 0+\OffsetA)(\BREG)
    lxv vs16, DISP8(\Index, 16+\OffsetA)(\BREG)
    xvmaddasp vs0, vs8, vs26
    xvmaddasp vs1, vs9, vs16

.if \IsLast==1
    addi \AREG, \AREG, DISP8(\Index,32)
    addi \BREG, \BREG, DISP8(\Index,32)
.endif

.endm

.macro KERNEL1x1_I_4 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxv vs8, DISP4(\Index, 0+\OffsetB)(\AREG)
    lxv vs26, DISP4(\Index, 0+\OffsetA)(\BREG)

    xvmaddasp vs0, vs8, vs26

.if \IsLast==1
    addi \AREG, \AREG, DISP4(\Index,16)
    addi \BREG, \BREG, DISP4(\Index,16)
.endif

.endm

.macro KERNEL1x1_I_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast

    lxsd v4, DISP2(\Index, 0+\OffsetB)(\AREG)
    lxsd v5, DISP2(\Index, 0+\OffsetA)(\BREG)

    xvmaddasp vs0, vs36, vs37

    addi \AREG, \AREG, DISP2(\Index,8)
    addi \BREG, \BREG, DISP2(\Index,8)
.endm

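/* write a single element of C: sum every vector lane plus the scalar
   accumulator vs4, apply alpha in double precision, store one word */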
.macro SAVE1x1

#ifndef TRMMKERNEL
    lxssp v4, 0(CO)
#endif

    /* convert alpha_r to double precision for the scalar multiply */
    xscvspdp vs16,alpha_r

    /* aggregate the vector accumulators */
    xvaddsp vs0,vs0,vs1
    xvaddsp vs2,vs2,vs3
    xvaddsp vs0,vs0,vs2

    xxpermdi vs7,vs0,vs0,2
    xvaddsp vs0,vs0,vs7
    /* fold the two remaining lanes into the scalar accumulator vs4 */
    xscvspdp vs5, vs0
    xxspltw vs6, vs0, 1
    xscvspdp vs6,vs6
    xsadddp vs7,vs5,vs6
    xsadddp vs4,vs4,vs7

    /* apply alpha and store the final word */
#if defined(TRMMKERNEL)
    xsmuldp vs36,vs4, vs16
#else
    xsmaddadp vs36,vs4, vs16
#endif

    stxssp v4, 0(CO)

    addi CO,CO,4

.endm

/**************************** TRMM POINTER REFRESH MACROS ****************************/

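/* REG1 = REG2 * SHIFT_VAL * unit_size (4 bytes), implemented as a left
   shift; handles SHIFT_VAL values of 16, 8, 4, 2 and 1 */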
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
    slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==8
    slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==4
    slwi \REG1, \REG2, 4
.elseif \SHIFT_VAL==2
    slwi \REG1, \REG2, 3
.elseif \SHIFT_VAL==1
    slwi \REG1, \REG2, 2
.endif
.endm

/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//     ptrbb = bb;
// #else
//     ptrba += off*16;
//     ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* ptrbb = bb; */
    mr \PTR_B,\B_VAL /* refresh BPOINT */

#else
    /*
    // ptrba = ptrba + off*C_A;
    // ptrbb = bb + off*C_B;
    */
    SHIFT_REG T4,\OFF_VAL,\C_B /* byte offset of the skipped B values */
    SHIFT_REG T2,\OFF_VAL,\C_A /* byte offset of the skipped A values */
    add \PTR_B, \B_VAL, T4 /* add the byte offset to BO */
    add \PTR_A, \PTR_A, T2 /* add the byte offset to AO */
#endif
.endm

/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
//     temp = bk-off;
// #elif defined(LEFT)
//     temp = off+16;  // number of values in A
// #else
//     temp = off+2;   // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
    /* temp = bk-off; */
    sub \TEMP_BK,\BK_VAL,\OFF_VAL

#elif defined(LEFT)
    /* temp = off+INCR_A;  // number of values in A */
    addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
    /* temp = off+INCR_B;  // number of values in B */
    addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif

.endm

/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//     temp = bk - off;
// #ifdef LEFT
//     temp -= 16;  // number of values in A
// #else
//     temp -= 2;   // number of values in B
// #endif
//     ptrba += temp*16;
//     ptrbb += temp*2;
// #endif
//
// #ifdef LEFT
//     off += 16;   // number of values in A
// #endif
*/

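/* TRMM bookkeeping after a tile is stored: step ptrba/ptrbb past the
   remaining k values (temp*C_A and temp*C_B elements, converted to bytes
   by SHIFT_REG) and, when LEFT is defined, advance off by C_A */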
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B

#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
    /* temp = bk - off; */
    sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
    /* temp -= C_A;  // number of values in A */
    addi \TEMP_BK,\TEMP_BK,-\C_A
#else
    /* temp -= C_B;  // number of values in B */
    addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
    /* ptrba += temp*C_A;
       ptrbb += temp*C_B; */
    SHIFT_REG T4,\TEMP_BK,\C_A
    SHIFT_REG T2,\TEMP_BK,\C_B
    add \PTR_A, \PTR_A,T4 /* ptrba += temp*C_A */
    add \PTR_B, \PTR_B,T2 /* ptrbb += temp*C_B */

#endif

#ifdef LEFT
    /* off += C_A;  // number of values in A */
    addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm