/* OpenBLAS/kernel/power/zgemm_macros_power10.S */
/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 16
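/* unit_size is the byte size of one double-complex element (2 x 8 bytes).
   DISPn(ind,disp) below computes the byte offset reached after `ind` loop
   iterations that each consume n such elements, plus a fixed displacement. */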
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
/* HELPERS FOR SAVE */
/* load {r0,i0} and {r1,i1} from C and repack them as {r0,r1} and {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm
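/* Note: xxmrghd/xxmrgld merge the high/low doublewords of two VSRs; the
   BE/LE branches in these helpers swap which form is used so that the
   logical {real,imag} ordering comes out the same on either byte order. */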
/* from 2 results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack into {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results*/
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results*/
#endif
.endm
/* from 2 results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack into {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real*/
#endif
.endm
/* {a0r*br op a0i*bi ,a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br ,a1r*bi op a1i*br} ~ {i0,i1}*/
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* assume {-alpha_r,-alpha_i} is passed in for this case */
/* this computes i1*i2 - r1*r2, so the negated alpha_r fixes the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* likewise, the negated alpha_i fixes the sign of the imaginary part */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
/* VSOUT1 = {i0,i1} * {alpha_i,alpha_i} - VSOUT1 ; VSOUT2 = VSOUT2 + {r0,r1} * {alpha_i,alpha_i} */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm
/* VSOUT1 = {r0,r1} * {alpha_r,alpha_r} - VSOUT1 ; VSOUT2 = VSOUT2 + {i0,i1} * {alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
/* unpack two {r,r},{i,i} pairs back into {r,i},{r,i} for storing with stxv (operand order swapped on little-endian) */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1,\VSIN1,\VSIN2
xxmrgld \VSOUT2,\VSIN1,\VSIN2
#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
#endif
.endm
.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm
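/* SAVE8/SAVE4/SAVE2/SAVE1 combine 8/4/2/1 complex results with alpha and
   (unless TRMMKERNEL) with the existing C values at BASE_REG+LOFFSET:
     C_r = C_r + alpha_r*res_r - alpha_i*res_i
     C_i = C_i + alpha_r*res_i + alpha_i*res_r
   where res_r/res_i are the aggregated real/imaginary products
   (sign conventions per AGGREGATE_REALS_IMAGES). */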
.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET +64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
MULT_APLHA_PART1 vs42,vs44, vs56,vs57
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
MULT_APLHA_PART1 \VSRes1,\VSRes3, vs58,vs59
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
MULT_APLHA_PART2 vs42,vs44,vs56,vs57
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
MULT_APLHA_PART2 \VSRes1,\VSRes3, vs58,vs59
UNPACK_FOR_STORE vs56,vs57,vs42,vs44
UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET +64),vs42,vs44
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART1 vs38,vs40, vs48,vs49
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
.endm
.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd vs46,vs50,vs50
xxmrgld vs47,vs50,vs50
#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36, vs46,vs47
MULT_APLHA_PART2 vs34,vs36, vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
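/* MMA flow used by the N=2, M=8 kernel (and, with fewer accumulators, by the
   smaller MMA kernels below): xxsetaccz zeroes/primes the accumulators, each
   xvf64gerpp accumulates a rank-1 outer product of an even/odd VSR pair from
   A with one 128-bit VSR from B, and xxmfacc writes accumulator k back to
   vs(4*k)..vs(4*k+3) before the SAVE macros run. KERNEL2x8_PRELOAD fetches
   the first iteration's operands, KERNEL2x8_2 processes two K iterations,
   and LOAD_END_2x8 finishes the preloaded odd iteration. */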
.macro KERNEL2x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
xxsetaccz 4
xxsetaccz 5
xxsetaccz 6
xxsetaccz 7
.endm
.macro KERNEL2x8_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x8_2 Index, IsLast
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
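/* lxvp fills the two VSRs of a pair from the two 16-byte halves in opposite
   order on little-endian, so the LE path below swaps vs48/vs49 (and vs50/vs51)
   to keep each B column matched with the same accumulator. */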
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
lxvp vs32, DISP16(\Index, 256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs44, vs50
xvf64gerpp 3, vs46, vs50
xvf64gerpp 4, vs40, vs51
xvf64gerpp 5, vs42, vs51
xvf64gerpp 6, vs44, vs51
xvf64gerpp 7, vs46, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 3, vs46, vs51
xvf64gerpp 4, vs40, vs50
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
xxmfacc 4
xxmfacc 5
xxmfacc 6
xxmfacc 7
.endm
.macro SAVE2x8
add T1, CO ,LDC
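/* xxmfacc left acc0..acc7 in vs0..vs31 (four VSRs per accumulator); the
   xxpermdi/xxlor shuffle below recombines those rows into the
   {ar*br,ai*bi} / {ar*bi,ai*br} pairs that SAVE8 expects, with the pair
   assignment swapped on little-endian. */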
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs35, vs18, vs19, 0b10
xxpermdi vs36, vs20, vs21, 0b01
xxpermdi vs37, vs20, vs21, 0b10
xxpermdi vs38, vs22, vs23, 0b01
xxpermdi vs39, vs22, vs23, 0b10
xxpermdi vs40, vs24, vs25, 0b01
xxpermdi vs41, vs24, vs25, 0b10
xxpermdi vs42, vs26, vs27, 0b01
xxpermdi vs43, vs26, vs27, 0b10
xxpermdi vs44, vs28, vs29, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs16, vs32, vs32
xxlor vs17, vs33, vs33
xxlor vs18, vs34, vs34
xxlor vs19, vs35, vs35
xxlor vs20, vs36, vs36
xxlor vs21, vs37, vs37
xxlor vs22, vs38, vs38
xxlor vs23, vs39, vs39
xxlor vs24, vs40, vs40
xxlor vs25, vs41, vs41
xxlor vs26, vs42, vs42
xxlor vs27, vs43, vs43
xxlor vs28, vs44, vs44
xxlor vs29, vs45, vs45
xxlor vs30, vs46, vs46
xxlor vs31, vs47, vs47
#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs17, vs35, vs35
xxlor vs22, vs36, vs36
xxlor vs23, vs37, vs37
xxlor vs20, vs38, vs38
xxlor vs21, vs39, vs39
xxlor vs26, vs40, vs40
xxlor vs27, vs41, vs41
xxlor vs24, vs42, vs42
xxlor vs25, vs43, vs43
xxlor vs30, vs44, vs44
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro KERNEL2x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL2x4_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x4_2 Index, IsLast
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
lxvp vs32, DISP8(\Index, 128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs40, vs51
xvf64gerpp 3, vs42, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x4 OffsetA, OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE2x4
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro KERNEL2x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL2x2_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real imag from B
.endm
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index, 32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
lxvp vs32, DISP4(\Index, 64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index, 64)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs40, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x2 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE2x2
add T1, CO ,LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
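/* The M=1 kernels below use plain VSX multiply-add instead of MMA:
   vs0/vs2 accumulate {ar*br, ai*bi} for B column 0/1 and vs1/vs3 accumulate
   {ar*bi, ai*br}, using xxswapd copies of B for the cross terms. */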
.macro ZERO2x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
xxlxor vs2, vs2, vs2
xxlxor vs3, vs3, vs3
.endm
.macro LOAD2x1
LOAD2x1O 0,0
.endm
.macro LOAD2x1O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm
.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.endm
.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm
.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END2x1_2
/* for the 2-unrolled load, the end offsets are 32 (A) and 64 (B) */
KERNEL2x1_2 AO,BO, 32,64,0 ,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xxswapd vs55, vs54
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real imag from B
lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
xxswapd vs51, vs50
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs2, vs40, vs54
xvmaddadp vs1, vs40, vs53
xvmaddadp vs3, vs40, vs55
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm
.macro SAVE2x1
add T1, CO ,LDC
SAVE1 vs0,vs1,CO,0
SAVE1 vs2,vs3,T1,0
addi CO, CO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
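/* With a single B column only four accumulators are needed; each KERNEL1x8_2
   step folds two K iterations (the two halves of the vs48/vs49 pair) into the
   same accumulators. */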
.macro KERNEL1x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm
.macro KERNEL1x8_2 Index,IsLast
lxvp vs32, DISP16(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index, 32)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index, 64)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index, 96)(AO) // load real,imag from A
lxvp vs40, DISP16(\Index, 128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index, 160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index, 192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index, 224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
xvf64gerpp 2, vs44, vs49
xvf64gerpp 3, vs46, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x8 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE1x8
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro KERNEL1x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm
.macro KERNEL1x4_2 Index,IsLast
lxvp vs32, DISP8(\Index, 0)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index, 32)(AO) // load real,imag from A
lxvp vs40, DISP8(\Index, 64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index, 96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x4 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE1x4
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro KERNEL1x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
.endm
.macro KERNEL1x2_2 Index,IsLast
lxvp vs32, DISP4(\Index, 0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index, 32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index, 0)(BO) // load real imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 0, vs40, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
.endif
.endm
.macro LOAD_END_1x2 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxv vs48, 0(BO) // load real imag from B
xvf64gerpp 0, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL1x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
.endm
.macro SAVE1x2
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
.endm
.macro LOAD1x1
LOAD1x1O 0,0
.endm
.macro LOAD1x1O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs49, vs48
.endm
.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm
.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.endm
.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm
.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs48,(\OffsetB+ 0)(BO) // load real imag from B
lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm
.macro END1x1_2
/* for the 2-unrolled load, the end offsets are 32 (A) and 32 (B) */
KERNEL1x1_2 AO,BO, 32,32,0 ,1,1
.endm
.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,1
.endm
.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast ,0
.endm
.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast ,Complete
xxswapd vs53, vs52
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.if \Complete==0
lxv vs32, DISP2(\Index, 0 + \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs1, vs40, vs53
.if \Complete==0
lxv vs40, DISP2(\Index,16+0+ \OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm
.macro SAVE1x1
SAVE1 vs0,vs1,CO,0
addi CO, CO, 16
.endm
/**************************** TRMM POINTER REFRESH MACROS *************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
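/* REG1 = REG2 * SHIFT_VAL * unit_size: the left-shift count is
   log2(SHIFT_VAL * 16), e.g. SHIFT_VAL==8 -> shift by 7 (multiply by 128). */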
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm
/*
//#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// ptrbb = bb;
// #else
// ptrba += off*16;
// ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb;*/
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba =ptrba+ off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* off * C_B * unit_size = byte offset into B */
SHIFT_REG T2,\OFF_VAL,\C_A /* off * C_A * unit_size = byte offset into A */
add \PTR_B, \B_VAL , T4 /* add the offset to BO */
add \PTR_A, \PTR_A, T2 /* add the offset to AO */
#endif
.endm
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
// temp = bk-off;
// #elif defined(LEFT)
// temp = off+16; // number of values in A
// #else
// temp = off+2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk-off;*/
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off+INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off+INCR_B // number of values in B*/
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
/*
// #if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
// temp = bk - off;
// #ifdef LEFT
// temp -= 16; // number of values in A
// #else
// temp -= 2; // number of values in B
// #endif
// ptrba += temp*16;
// ptrbb += temp*2;
// #endif
// #ifdef LEFT
// off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if ( defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/*temp = bk - off;*/
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/*ptrba += temp*C_A;
ptrbb += temp*C_B;*/
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A, T4 /* ptrba += temp*C_A (in bytes) */
add \PTR_B, \PTR_B, T2 /* ptrbb += temp*C_B (in bytes) */
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm