This patch introduces a new optimized version of the ZGEMM kernel that uses the POWER10 Matrix-Multiply Assist (MMA) feature introduced in POWER ISA v3.1. It uses the new POWER10 compute instructions for the matrix multiplication operation. Tested on the simulator; there are no new test failures. The cycle count is reduced by 30-50% compared to the POWER9 version, depending on the M/N/K sizes.
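For reference, the compute pattern the kernel is built around is: zero and prime an MMA accumulator (xxsetaccz), feed it rank-2 outer-product updates (xvf64gerpp) from vector-pair loads (lxvp), then move the accumulator contents back to VSRs (xxmfacc) for the save path. Below is a minimal sketch of that pattern written with the GCC 10+ MMA built-ins rather than assembly; it is not part of the patch, it computes a plain real 4x2 double tile (the ZGEMM real/imaginary splitting and alpha scaling are handled separately by the SAVE* macros in this file), and the function and variable names are illustrative only.

#include <altivec.h>

/* Sketch: C[4][2] += A[4][k] * B[k][2] using one MMA accumulator. */
static void mma_4x2_tile(double *a, double *b, double *c, long k)
{
    __vector_quad acc;                  /* one 512-bit accumulator (4 VSRs)          */
    __builtin_mma_xxsetaccz(&acc);      /* zero out and prime, like KERNEL*_ZERO_AND_PRIME_MMA */

    for (long i = 0; i < k; i++) {
        __vector_pair pa = *(__vector_pair *)(a + 4 * i);  /* 4 doubles of A (lxvp) */
        vector double  vb = *(vector double *)(b + 2 * i); /* 2 doubles of B (lxv)  */
        /* rank-2 update: acc += pa (4x1) * vb (1x2), like xvf64gerpp in the kernel */
        __builtin_mma_xvf64gerpp(&acc, pa, (vector unsigned char)vb);
    }

    vector double rows[4];
    __builtin_mma_disassemble_acc(rows, &acc);  /* "unprime"/read back, like xxmfacc */
    for (int r = 0; r < 4; r++)
        *(vector double *)(c + 2 * r) += rows[r];
}

Compile with something like -mcpu=power10 (or -mmma); again, this is only a sketch of the accumulate pattern, not the kernel itself.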
/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/

#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)

/* HELPERS FOR SAVE */

/* {r0,i0} and {r1,i1} into {r0,r1} {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
.endm

/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi} pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
.endm

/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br} pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
.endm

/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1} */
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* i1i2-r1r2, so we will negate alpha real instead to fix the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* we will negate alpha imaginary instead to fix the sign */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
/* {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ;VSOUT2 + {r0,r1}*{alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm

/* {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ;VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm

/* unpack to store 2{r,r} {i,i} into {r,i} {r,i} (big endian because of stxv) */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
.endm

.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
MULT_APLHA_PART1 vs42,vs44, vs56,vs57
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
MULT_APLHA_PART2 vs42,vs44,vs56,vs57
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59
UNPACK_FOR_STORE vs56,vs57,vs42,vs44
UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET+64),vs42,vs44
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART1 vs38,vs40, vs48,vs49
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
.endm
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
xxmrghd vs39,vs47,vs46
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro KERNEL2x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
xxsetaccz 4
xxsetaccz 5
xxsetaccz 6
xxsetaccz 7
.endm

.macro KERNEL2x8_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x8_2 Index, IsLast
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 3, vs46, vs51
xvf64gerpp 4, vs40, vs50
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x8 OffsetA,OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL2x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
xxmfacc 4
xxmfacc 5
xxmfacc 6
xxmfacc 7
.endm
.macro SAVE2x8
add T1, CO ,LDC
/* after xxmfacc the accumulators sit in vs0-vs31; the permutes below rearrange the accumulated doublewords into the operand order SAVE8 expects */
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47

xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs35, vs18, vs19, 0b10
xxpermdi vs36, vs20, vs21, 0b01
xxpermdi vs37, vs20, vs21, 0b10
xxpermdi vs38, vs22, vs23, 0b01
xxpermdi vs39, vs22, vs23, 0b10
xxpermdi vs40, vs24, vs25, 0b01
xxpermdi vs41, vs24, vs25, 0b10
xxpermdi vs42, vs26, vs27, 0b01
xxpermdi vs43, vs26, vs27, 0b10
xxpermdi vs44, vs28, vs29, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10

xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs17, vs35, vs35
xxlor vs22, vs36, vs36
xxlor vs23, vs37, vs37
xxlor vs20, vs38, vs38
xxlor vs21, vs39, vs39
xxlor vs26, vs40, vs40
xxlor vs27, vs41, vs41
xxlor vs24, vs42, vs42
xxlor vs25, vs43, vs43
xxlor vs30, vs44, vs44
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47

SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro KERNEL2x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm

.macro KERNEL2x4_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x4_2 Index, IsLast
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x4 OffsetA, OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL2x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE2x4
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47

SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro KERNEL2x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm

.macro KERNEL2x2_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x2 OffsetA,OffsetB
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL2x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE2x2
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39

SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro ZERO2x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
xxlxor vs2, vs2, vs2
xxlxor vs3, vs3, vs3
.endm

.macro LOAD2x1
LOAD2x1O 0,0
.endm

.macro LOAD2x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+ 0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
.endm

.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.endm

.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm

.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+ 0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm

.macro END2x1_2
/* for load2 the offsets will be 32 and 64 */
KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm

.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm

.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xxswapd vs55, vs54
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
xxswapd vs51, vs50
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs2, vs40, vs54
xvmaddadp vs1, vs40, vs53
xvmaddadp vs3, vs40, vs55
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm

.macro SAVE2x1
add T1, CO ,LDC
SAVE1 vs0,vs1,CO,0
SAVE1 vs2,vs3,T1,0
addi CO, CO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro KERNEL1x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL1x8_2 Index,IsLast
lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A
lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x8 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE1x8
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47

SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro KERNEL1x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL1x4_2 Index,IsLast
lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x4 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE1x4
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39

SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro KERNEL1x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulator */
xxsetaccz 0
.endm
.macro KERNEL1x2_2 Index,IsLast
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x2 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x2_UNPRIME_MMA
/* "unprime" MMA accumulator */
xxmfacc 0
.endm
.macro SAVE1x2
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10

xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35

SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
.endm

.macro LOAD1x1
LOAD1x1O 0,0
.endm

.macro LOAD1x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+ 0)(BO) // load real,imag from B
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs49, vs48
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm

.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.endm

.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm

.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+ 0)(BO) // load real,imag from B
lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm

.macro END1x1_2
/* for load2 the offsets will be 32 and 32 */
KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
.endm
.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm

.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm

.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs1, vs40, vs53
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm

.macro SAVE1x1
SAVE1 vs0,vs1,CO,0
addi CO, CO, 16
.endm
/****************************TRMM POINTER REFRESH MACROS*************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm

/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb; */
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* Number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* Number of values in A shifted */
add \PTR_B, \B_VAL , T4 /* Add values to BO */
add \PTR_A, \PTR_A, T2 /* Add values to AO */
#endif
.endm

/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B; // number of values in B */
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm

/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/* ptrba += temp*C_A;
   ptrbb += temp*C_B; */
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A,T4 /* ptrba + temp*C_A */
add \PTR_B, \PTR_B,T2
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm