/***************************************************************************
Copyright (c) 2013-2020, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#define unit_size 16
#define DISP32(ind,disp) (ind*unit_size*32+disp)
#define DISP16(ind,disp) (ind*unit_size*16+disp)
#define DISP8(ind,disp) (ind*unit_size*8+disp)
#define DISP4(ind,disp) (ind*unit_size*4+disp)
#define DISP2(ind,disp) (ind*unit_size*2+disp)
#define DISP1(ind,disp) (ind*unit_size+disp)
#define DISPX(disp) (disp)
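/*
 * Worked example: with unit_size = 16 (one double-complex element),
 *   DISP16(ind,disp) = ind*16*16 + disp = ind*256 + disp,
 * so DISP16(1,128) = 384 bytes, i.e. one 16-element panel of A plus a
 * 128-byte displacement.  DISPX passes a raw displacement through.
 */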
/* HELPERS FOR SAVE */

/* load {r0,i0} and {r1,i1} from C and repack them as {r0,r1} and {i0,i1} */
.macro LOAD_COUPLE_AS_RR_II VS_OUT1,VS_OUT2,VS_TEMP1,VS_TEMP2,REG,LOFFSET
#ifndef TRMMKERNEL
lxv \VS_TEMP1, DISPX(\LOFFSET)(\REG)
lxv \VS_TEMP2, DISPX(\LOFFSET+16)(\REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrgld \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#else
xxmrgld \VS_OUT1,\VS_TEMP1,\VS_TEMP2
xxmrghd \VS_OUT2,\VS_TEMP1,\VS_TEMP2
#endif
#endif
.endm

/* from two results {a0r*br,a0i*bi} and {a1r*br,a1i*bi}, pack {a0r*br,a1r*br} and {a0i*bi,a1i*bi} */
.macro RESULT_INTO_REALREAL_IMAGEIMAGE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*real from 2 results */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*imag from 2 results */
#endif
.endm

/* from two results {a0r*bi,a0i*br} and {a1r*bi,a1i*br}, pack {a0r*bi,a1r*bi} and {a0i*br,a1i*br} */
.macro RESULT_INTO_REALIMAG_IMAGREAL VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrgld \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
#else
xxmrgld \VSOUT1, \VSIN1,\VSIN2 /* real*imag */
xxmrghd \VSOUT2, \VSIN1,\VSIN2 /* imag*real */
#endif
.endm
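/*
 * For reference: (a_r + i*a_i)*(b_r + i*b_i)
 *              = (a_r*b_r - a_i*b_i) + i*(a_r*b_i + a_i*b_r).
 * Conjugating A and/or B only flips the signs of the a_i and/or b_i
 * terms, which is why the macro below picks a different add/sub
 * combination for each transpose/conjugate variant.
 */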
/* {a0r*br op a0i*bi, a1r*br op a1i*bi} ~ {r0,r1}; {a0r*bi op a0i*br, a1r*bi op a1i*br} ~ {i0,i1} */
.macro AGGREGATE_REALS_IMAGES VSINR_OUT1,VSINR,VSINI_OUT2,VSINI
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
xvsubdp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
xvadddp \VSINR_OUT1,\VSINR_OUT1,\VSINR
xvsubdp \VSINI_OUT2,\VSINI,\VSINI_OUT2
#else // CC || CR || RC || RR
/* we will assume {-alpha_r,-alpha_i} for this case */
/* this yields i1*i2 - r1*r2, so we negate alpha_r instead to fix the sign */
xvsubdp \VSINR_OUT1,\VSINR,\VSINR_OUT1
/* and we negate alpha_i instead to fix the sign */
xvadddp \VSINI_OUT2,\VSINI_OUT2,\VSINI
#endif
.endm
/* non-TRMM: VSOUT1 = {i0,i1}*{alpha_i,alpha_i} - VSOUT1 ; VSOUT2 += {r0,r1}*{alpha_i,alpha_i}
   TRMM: plain multiplies, since C is not read */
.macro MULT_APLHA_PART1 VSINRR,VSINII,VSOUT1,VSOUT2
#ifndef TRMMKERNEL
xvmsubadp \VSOUT1,\VSINII, alpha_i
xvmaddadp \VSOUT2,\VSINRR, alpha_i
#else
xvmuldp \VSOUT1,\VSINII, alpha_i
xvmuldp \VSOUT2,\VSINRR, alpha_i
#endif
.endm

/* VSOUT1 = {r0,r1}*{alpha_r,alpha_r} - VSOUT1 ; VSOUT2 += {i0,i1}*{alpha_r,alpha_r} */
.macro MULT_APLHA_PART2 VSINRR,VSINII,VSOUT1,VSOUT2
xvmsubadp \VSOUT1,\VSINRR, alpha_r
xvmaddadp \VSOUT2,\VSINII, alpha_r
.endm
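/*
 * Net effect of PART1 followed by PART2 in the non-TRMM path, per lane
 * (VSOUT1/VSOUT2 start out holding the C values loaded by
 * LOAD_COUPLE_AS_RR_II; xvmsubadp computes T = A*B - T and xvmaddadp
 * computes T = A*B + T):
 *   out_r = r*alpha_r - (i*alpha_i - C_r) = C_r + r*alpha_r - i*alpha_i
 *   out_i = C_i + r*alpha_i + i*alpha_r
 * i.e. C += alpha * result, the usual complex axpy.
 */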
/* unpack {r0,r1} and {i0,i1} back into {r0,i0} and {r1,i1} for the stxv stores */
.macro UNPACK_FOR_STORE VSIN1,VSIN2,VSOUT1,VSOUT2
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd \VSOUT1,\VSIN1,\VSIN2
xxmrgld \VSOUT2,\VSIN1,\VSIN2
#else
xxmrghd \VSOUT1,\VSIN2,\VSIN1
xxmrgld \VSOUT2,\VSIN2,\VSIN1
#endif
.endm

.macro STORE_COUPLE REG,LOFFSET,VSIN1,VSIN2
stxv \VSIN1, DISPX(\LOFFSET)(\REG)
stxv \VSIN2, DISPX(\LOFFSET+16)(\REG)
.endm

.macro SAVE8 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,VSRes9,VSRes10,VSRes11,VSRes12,VSRes13,VSRes14,VSRes15,VSRes16,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
LOAD_COUPLE_AS_RR_II vs56,vs57,vs50,vs51,\BASE_REG,(\LOFFSET+64)
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
LOAD_COUPLE_AS_RR_II vs58,vs59,vs52,vs53,\BASE_REG,(\LOFFSET+96)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes9,\VSRes11,vs42,vs43
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
RESULT_INTO_REALIMAG_IMAGREAL \VSRes10,\VSRes12,vs44,vs45
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes13,\VSRes15,\VSRes1,\VSRes2
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
RESULT_INTO_REALIMAG_IMAGREAL \VSRes14,\VSRes16,\VSRes3,\VSRes4
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
AGGREGATE_REALS_IMAGES vs42,vs43,vs44,vs45
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
AGGREGATE_REALS_IMAGES \VSRes1,\VSRes2,\VSRes3,\VSRes4
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
MULT_APLHA_PART1 vs42,vs44,vs56,vs57
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
MULT_APLHA_PART1 \VSRes1,\VSRes3,vs58,vs59
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
MULT_APLHA_PART2 vs42,vs44,vs56,vs57
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
MULT_APLHA_PART2 \VSRes1,\VSRes3,vs58,vs59
UNPACK_FOR_STORE vs56,vs57,vs42,vs44
UNPACK_FOR_STORE vs58,vs59,\VSRes1,\VSRes3
STORE_COUPLE \BASE_REG,(\LOFFSET+64),vs42,vs44
STORE_COUPLE \BASE_REG,(\LOFFSET+96),\VSRes1,\VSRes3
.endm
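/*
 * SAVE8 deliberately interleaves the C loads, repacking, alpha scaling
 * and stores so that independent work can hide latency; SAVE4, SAVE2
 * and SAVE1 below are the same pipeline at narrower widths.
 */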
.macro SAVE4 VSRes1,VSRes2,VSRes3,VSRes4,VSRes5,VSRes6,VSRes7,VSRes8,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
LOAD_COUPLE_AS_RR_II vs48,vs49,vs52,vs53,\BASE_REG,(\LOFFSET+32)
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes5,\VSRes7,vs38,vs39
RESULT_INTO_REALIMAG_IMAGREAL \VSRes6,\VSRes8,vs40,vs41
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
AGGREGATE_REALS_IMAGES vs38,vs39,vs40,vs41
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART1 vs38,vs40,vs48,vs49
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs38,vs40,vs48,vs49
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
UNPACK_FOR_STORE vs48,vs49,vs35,vs37
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
STORE_COUPLE \BASE_REG,(\LOFFSET+32),vs35,vs37
.endm

.macro SAVE2 VSRes1,VSRes2,VSRes3,VSRes4,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes3,vs34,vs35
LOAD_COUPLE_AS_RR_II vs46,vs47,vs50,vs51,\BASE_REG,\LOFFSET
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes4,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
STORE_COUPLE \BASE_REG,\LOFFSET,vs39,vs41
.endm
.macro SAVE1 VSRes1,VSRes2,BASE_REG,LOFFSET
RESULT_INTO_REALREAL_IMAGEIMAGE \VSRes1,\VSRes1,vs34,vs35
#ifndef TRMMKERNEL
lxv vs50, (\LOFFSET)(\BASE_REG)
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxmrghd vs46,vs50,vs50
xxmrgld vs47,vs50,vs50
#else
xxmrgld vs46,vs50,vs50
xxmrghd vs47,vs50,vs50
#endif
#endif
RESULT_INTO_REALIMAG_IMAGREAL \VSRes2,\VSRes2,vs36,vs37
AGGREGATE_REALS_IMAGES vs34,vs35,vs36,vs37
MULT_APLHA_PART1 vs34,vs36,vs46,vs47
MULT_APLHA_PART2 vs34,vs36,vs46,vs47
UNPACK_FOR_STORE vs46,vs47,vs39,vs41
#if (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
xxmrghd vs39,vs47,vs46
#endif
stxv vs39, (\LOFFSET)(\BASE_REG)
.endm
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
.macro KERNEL2x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
xxsetaccz 4
xxsetaccz 5
xxsetaccz 6
xxsetaccz 7
.endm

.macro KERNEL2x8_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
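/*
 * Note on the endian-guarded blocks below: lxvp loads an even/odd VSR
 * pair, and the pairing of the two 16-byte halves with memory order
 * appears reversed between big- and little-endian modes, so the roles
 * of the two B halves (e.g. vs48/vs49) swap between the branches.
 * Each xvf64gerpp accumulates a rank-1 outer-product update of a 4x2
 * double-precision tile into the named MMA accumulator.
 */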
.macro KERNEL2x8_2 Index, IsLast
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
lxvp vs32, DISP16(\Index,256)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index,288)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index,320)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index,352)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs44, vs50
xvf64gerpp 3, vs46, vs50
xvf64gerpp 4, vs40, vs51
xvf64gerpp 5, vs42, vs51
xvf64gerpp 6, vs44, vs51
xvf64gerpp 7, vs46, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs44, vs51
xvf64gerpp 3, vs46, vs51
xvf64gerpp 4, vs40, vs50
xvf64gerpp 5, vs42, vs50
xvf64gerpp 6, vs44, vs50
xvf64gerpp 7, vs46, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x8 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 4, vs32, vs49
xvf64gerpp 5, vs34, vs49
xvf64gerpp 6, vs36, vs49
xvf64gerpp 7, vs38, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 4, vs32, vs48
xvf64gerpp 5, vs34, vs48
xvf64gerpp 6, vs36, vs48
xvf64gerpp 7, vs38, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm
.macro KERNEL2x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
xxmfacc 4
xxmfacc 5
xxmfacc 6
xxmfacc 7
.endm
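/*
 * After xxmfacc, accumulator i is readable through its four underlying
 * VSRs (acc0 -> vs0-vs3, acc1 -> vs4-vs7, and so on), which is why the
 * SAVE macros below start from vs0..vs31 and repack with xxpermdi.
 */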
.macro SAVE2x8
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
xxpermdi vs32, vs16, vs17, 0b01
xxpermdi vs33, vs16, vs17, 0b10
xxpermdi vs34, vs18, vs19, 0b01
xxpermdi vs35, vs18, vs19, 0b10
xxpermdi vs36, vs20, vs21, 0b01
xxpermdi vs37, vs20, vs21, 0b10
xxpermdi vs38, vs22, vs23, 0b01
xxpermdi vs39, vs22, vs23, 0b10
xxpermdi vs40, vs24, vs25, 0b01
xxpermdi vs41, vs24, vs25, 0b10
xxpermdi vs42, vs26, vs27, 0b01
xxpermdi vs43, vs26, vs27, 0b10
xxpermdi vs44, vs28, vs29, 0b01
xxpermdi vs45, vs28, vs29, 0b10
xxpermdi vs46, vs30, vs31, 0b01
xxpermdi vs47, vs30, vs31, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs16, vs32, vs32
xxlor vs17, vs33, vs33
xxlor vs18, vs34, vs34
xxlor vs19, vs35, vs35
xxlor vs20, vs36, vs36
xxlor vs21, vs37, vs37
xxlor vs22, vs38, vs38
xxlor vs23, vs39, vs39
xxlor vs24, vs40, vs40
xxlor vs25, vs41, vs41
xxlor vs26, vs42, vs42
xxlor vs27, vs43, vs43
xxlor vs28, vs44, vs44
xxlor vs29, vs45, vs45
xxlor vs30, vs46, vs46
xxlor vs31, vs47, vs47
#else
xxlor vs18, vs32, vs32
xxlor vs19, vs33, vs33
xxlor vs16, vs34, vs34
xxlor vs17, vs35, vs35
xxlor vs22, vs36, vs36
xxlor vs23, vs37, vs37
xxlor vs20, vs38, vs38
xxlor vs21, vs39, vs39
xxlor vs26, vs40, vs40
xxlor vs27, vs41, vs41
xxlor vs24, vs42, vs42
xxlor vs25, vs43, vs43
xxlor vs30, vs44, vs44
xxlor vs31, vs45, vs45
xxlor vs28, vs46, vs46
xxlor vs29, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
SAVE8 vs16,vs17,vs18,vs19,vs20,vs21,vs22,vs23,vs24,vs25,vs26,vs27,vs28,vs29,vs30,vs31,T1,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
.macro KERNEL2x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm

.macro KERNEL2x4_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x4_2 Index, IsLast
lxvp vs40, DISP8(\Index,64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index,96)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
lxvp vs32, DISP8(\Index,128)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index,160)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs42, vs50
xvf64gerpp 2, vs40, vs51
xvf64gerpp 3, vs42, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs42, vs51
xvf64gerpp 2, vs40, vs50
xvf64gerpp 3, vs42, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP4(\Index,64)
.endif
.endm
.macro LOAD_END_2x4 OffsetA, OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs32, vs49
xvf64gerpp 3, vs34, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs32, vs48
xvf64gerpp 3, vs34, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL2x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE2x4
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
SAVE4 vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,T1,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
.macro KERNEL2x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm

.macro KERNEL2x2_PRELOAD
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs48, 0(BO) // load real,imag from B
.endm
.macro KERNEL2x2_2 Index, IsLast
lxvp vs40, DISP4(\Index,32)(AO) // load real,imag from A
lxvp vs50, DISP4(\Index,32)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
lxvp vs32, DISP4(\Index,64)(AO) // load real,imag from A
lxvp vs48, DISP4(\Index,64)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs40, vs50
xvf64gerpp 1, vs40, vs51
#else
xvf64gerpp 0, vs40, vs51
xvf64gerpp 1, vs40, vs50
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP4(\Index,64)
.endif
.endm

.macro LOAD_END_2x2 OffsetA,OffsetB
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs32, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs32, vs48
#endif
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL2x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE2x2
add T1, CO, LDC
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
SAVE2 vs4,vs5,vs6,vs7,T1,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
.macro ZERO2x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
xxlxor vs2, vs2, vs2
xxlxor vs3, vs3, vs3
.endm

.macro LOAD2x1
LOAD2x1O 0,0
.endm

.macro LOAD2x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
.endm

.macro END2x1_WITHOUT_ADD
END2x1 AO,BO,0,0
.endm

.macro END2x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.endm

.macro LOAD2x1_2
LOAD2x1_2O 0,0
.endm

.macro LOAD2x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs50, (\OffsetB+16)(BO) // load real,imag from B
lxv vs52, (\OffsetB+32)(BO) // load real,imag from B
lxv vs54, (\OffsetB+48)(BO) // load real,imag from B
xxswapd vs49, vs48
xxswapd vs51, vs50
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm

.macro END2x1_2
/* for load2 the offsets will be 32 and 64 */
KERNEL2x1_2 AO,BO, 32,64, 0,1,1
.endm
.macro KERNEL2x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
.endm

.macro KERNEL2x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL2x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
.endm
.macro KERNEL2x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
xxswapd vs53, vs52
xxswapd vs55, vs54
xvmaddadp vs0, vs32, vs48
xvmaddadp vs2, vs32, vs50
xvmaddadp vs1, vs32, vs49
xvmaddadp vs3, vs32, vs51
.if \Complete==0
lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP4(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
lxv vs50, DISP4(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
xxswapd vs51, vs50
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs2, vs40, vs54
xvmaddadp vs1, vs40, vs53
xvmaddadp vs3, vs40, vs55
.if \Complete==0
lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP4(\Index, 32+\OffsetB)(\BREG) // load real,imag from B
lxv vs54, DISP4(\Index, 48+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP4(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP4(\Index,64)
.endif
.endif
.endm
.macro KERNEL2x1
LOAD2x1
END2x1 AO, BO, 16,32
.endm

.macro SAVE2x1
add T1, CO, LDC
SAVE1 vs0,vs1,CO,0
SAVE1 vs2,vs3,T1,0
addi CO, CO, 16
.endm
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
.macro KERNEL1x8_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
xxsetaccz 2
xxsetaccz 3
.endm

.macro KERNEL1x8_2 Index,IsLast
lxvp vs32, DISP16(\Index,0)(AO) // load real,imag from A
lxvp vs34, DISP16(\Index,32)(AO) // load real,imag from A
lxvp vs36, DISP16(\Index,64)(AO) // load real,imag from A
lxvp vs38, DISP16(\Index,96)(AO) // load real,imag from A
lxvp vs40, DISP16(\Index,128)(AO) // load real,imag from A
lxvp vs42, DISP16(\Index,160)(AO) // load real,imag from A
lxvp vs44, DISP16(\Index,192)(AO) // load real,imag from A
lxvp vs46, DISP16(\Index,224)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
xvf64gerpp 2, vs44, vs49
xvf64gerpp 3, vs46, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 2, vs36, vs49
xvf64gerpp 3, vs38, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
xvf64gerpp 2, vs44, vs48
xvf64gerpp 3, vs46, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP16(\Index,256)
addi BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x8 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxvp vs36, 64(AO) // load real,imag from A
lxvp vs38, 96(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 2, vs36, vs48
xvf64gerpp 3, vs38, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x8_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
xxmfacc 2
xxmfacc 3
.endm
.macro SAVE1x8
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
xxpermdi vs40, vs8, vs9, 0b01
xxpermdi vs41, vs8, vs9, 0b10
xxpermdi vs42, vs10, vs11, 0b01
xxpermdi vs43, vs10, vs11, 0b10
xxpermdi vs44, vs12, vs13, 0b01
xxpermdi vs45, vs12, vs13, 0b10
xxpermdi vs46, vs14, vs15, 0b01
xxpermdi vs47, vs14, vs15, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
xxlor vs8, vs40, vs40
xxlor vs9, vs41, vs41
xxlor vs10, vs42, vs42
xxlor vs11, vs43, vs43
xxlor vs12, vs44, vs44
xxlor vs13, vs45, vs45
xxlor vs14, vs46, vs46
xxlor vs15, vs47, vs47
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
xxlor vs10, vs40, vs40
xxlor vs11, vs41, vs41
xxlor vs8, vs42, vs42
xxlor vs9, vs43, vs43
xxlor vs14, vs44, vs44
xxlor vs15, vs45, vs45
xxlor vs12, vs46, vs46
xxlor vs13, vs47, vs47
#endif
SAVE8 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,vs8,vs9,vs10,vs11,vs12,vs13,vs14,vs15,CO,0
addi CO, CO, 128
.endm
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
.macro KERNEL1x4_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
xxsetaccz 1
.endm

.macro KERNEL1x4_2 Index,IsLast
lxvp vs32, DISP8(\Index,0)(AO) // load real,imag from A
lxvp vs34, DISP8(\Index,32)(AO) // load real,imag from A
lxvp vs40, DISP8(\Index,64)(AO) // load real,imag from A
lxvp vs42, DISP8(\Index,96)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
xvf64gerpp 0, vs40, vs49
xvf64gerpp 1, vs42, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 1, vs34, vs49
xvf64gerpp 0, vs40, vs48
xvf64gerpp 1, vs42, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP8(\Index,128)
addi BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x4 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxvp vs34, 32(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
xvf64gerpp 1, vs34, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x4_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
xxmfacc 1
.endm
.macro SAVE1x4
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
xxpermdi vs36, vs4, vs5, 0b01
xxpermdi vs37, vs4, vs5, 0b10
xxpermdi vs38, vs6, vs7, 0b01
xxpermdi vs39, vs6, vs7, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
xxlor vs4, vs36, vs36
xxlor vs5, vs37, vs37
xxlor vs6, vs38, vs38
xxlor vs7, vs39, vs39
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
xxlor vs6, vs36, vs36
xxlor vs7, vs37, vs37
xxlor vs4, vs38, vs38
xxlor vs5, vs39, vs39
#endif
SAVE4 vs0,vs1,vs2,vs3,vs4,vs5,vs6,vs7,CO,0
addi CO, CO, 64
.endm
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
.macro KERNEL1x2_ZERO_AND_PRIME_MMA
/* zero out and prime the MMA accumulators */
xxsetaccz 0
.endm

.macro KERNEL1x2_2 Index,IsLast
lxvp vs32, DISP4(\Index,0)(AO) // load real,imag from A
lxvp vs40, DISP4(\Index,32)(AO) // load real,imag from A
lxvp vs48, DISP2(\Index,0)(BO) // load real,imag from B
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xvf64gerpp 0, vs32, vs48
xvf64gerpp 0, vs40, vs49
#else
xvf64gerpp 0, vs32, vs49
xvf64gerpp 0, vs40, vs48
#endif
.if \IsLast==1
addi AO, AO, DISP4(\Index,64)
addi BO, BO, DISP2(\Index,32)
.endif
.endm

.macro LOAD_END_1x2 OffsetA,OffsetB
lxvp vs32, 0(AO) // load real,imag from A
lxv vs48, 0(BO) // load real,imag from B
xvf64gerpp 0, vs32, vs48
addi BO, BO, \OffsetB
addi AO, AO, \OffsetA
.endm

.macro KERNEL1x2_UNPRIME_MMA
/* "unprime" MMA accumulators */
xxmfacc 0
.endm
.macro SAVE1x2
xxpermdi vs32, vs0, vs1, 0b01
xxpermdi vs33, vs0, vs1, 0b10
xxpermdi vs34, vs2, vs3, 0b01
xxpermdi vs35, vs2, vs3, 0b10
#if (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
xxlor vs0, vs32, vs32
xxlor vs1, vs33, vs33
xxlor vs2, vs34, vs34
xxlor vs3, vs35, vs35
#else
xxlor vs2, vs32, vs32
xxlor vs3, vs33, vs33
xxlor vs0, vs34, vs34
xxlor vs1, vs35, vs35
#endif
SAVE2 vs0,vs1,vs2,vs3,CO,0
addi CO, CO, 32
.endm
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
.macro ZERO1x1
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
.endm

.macro LOAD1x1
LOAD1x1O 0,0
.endm

.macro LOAD1x1O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
xxswapd vs49, vs48
.endm

.macro END1x1_WITHOUT_ADD
END1x1 AO,BO,0,0
.endm

.macro END1x1 AREG, BREG, OffsetA, OffsetB
.if \OffsetB != 0
addi \BREG, \BREG, \OffsetB
.endif
.if \OffsetA != 0
addi \AREG, \AREG, \OffsetA
.endif
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.endm

.macro LOAD1x1_2
LOAD1x1_2O 0,0
.endm

.macro LOAD1x1_2O OffsetA,OffsetB
lxv vs48, (\OffsetB+0)(BO) // load real,imag from B
lxv vs52, (\OffsetB+16)(BO) // load real,imag from B
xxswapd vs49, vs48
lxv vs32, (0+\OffsetA)(AO) // load real,imag from A
lxv vs40, (16+\OffsetA)(AO) // load real,imag from A
.endm

.macro END1x1_2
/* for load2 the offsets will be 32 and 32 */
KERNEL1x1_2 AO,BO, 32,32, 0,1,1
.endm

.macro KERNEL1x1_E2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 1
.endm

.macro KERNEL1x1_L2 OffsetA,OffsetB, Index,IsLast
KERNEL1x1_2 AO,BO, \OffsetA,\OffsetB, \Index,\IsLast, 0
.endm
.macro KERNEL1x1_2 AREG,BREG, OffsetA,OffsetB, Index,IsLast, Complete
xxswapd vs53, vs52
xvmaddadp vs0, vs32, vs48
xvmaddadp vs1, vs32, vs49
.if \Complete==0
lxv vs32, DISP2(\Index, 0+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs48, DISP2(\Index, 0+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \Complete==0
xxswapd vs49, vs48
.endif
xvmaddadp vs0, vs40, vs52
xvmaddadp vs1, vs40, vs53
.if \Complete==0
lxv vs40, DISP2(\Index, 16+\OffsetA)(\AREG) // load real,imag from A
.endif
.if \Complete==0
lxv vs52, DISP2(\Index, 16+\OffsetB)(\BREG) // load real,imag from B
.endif
.if \IsLast==1
.if \Complete==1
addi \AREG, \AREG, DISP2(\Index,\OffsetA)
addi \BREG, \BREG, DISP2(\Index,\OffsetB)
.else
addi \AREG, \AREG, DISP2(\Index,32)
addi \BREG, \BREG, DISP2(\Index,32)
.endif
.endif
.endm
.macro KERNEL1x1
LOAD1x1
END1x1 AO, BO, 16,16
.endm

.macro SAVE1x1
SAVE1 vs0,vs1,CO,0
addi CO, CO, 16
.endm
/**************************** TRMM POINTER REFRESH MACROS *************************/
.macro SHIFT_REG REG1,REG2,SHIFT_VAL
.if \SHIFT_VAL==16
slwi \REG1, \REG2, 8
.elseif \SHIFT_VAL==8
slwi \REG1, \REG2, 7
.elseif \SHIFT_VAL==4
slwi \REG1, \REG2, 6
.elseif \SHIFT_VAL==2
slwi \REG1, \REG2, 5
.elseif \SHIFT_VAL==1
slwi \REG1, \REG2, 4
.endif
.endm
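/*
 * SHIFT_REG computes REG1 = REG2 * SHIFT_VAL * unit_size as a shift:
 * e.g. SHIFT_VAL==8 gives REG1 = REG2 << 7 = REG2 * 8 * 16 bytes.
 */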
/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//   ptrbb = bb;
// #else
//   ptrba += off*16;
//   ptrbb = bb + off*2;
// #endif
*/
.macro REFRESH_POINTERS PTR_A,PTR_B,OFF_VAL,B_VAL,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* ptrbb = bb; */
mr \PTR_B,\B_VAL /* refresh BPOINT */
#else
/*
// ptrba = ptrba + off*C_A;
// ptrbb = bb + off*C_B;
*/
SHIFT_REG T4,\OFF_VAL,\C_B /* number of values in B shifted */
SHIFT_REG T2,\OFF_VAL,\C_A /* number of values in A shifted */
add \PTR_B, \B_VAL, T4 /* add values to BO */
add \PTR_A, \PTR_A, T2 /* add values to AO */
#endif
.endm
/*
// #if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
//   temp = bk - off;
// #elif defined(LEFT)
//   temp = off + 16; // number of values in A
// #else
//   temp = off + 2; // number of values in B
// #endif
*/
.macro REFRESH_TEMP_BK TEMP_BK,BK_VAL,OFF_VAL,INCR_A,INCR_B
#if (defined(LEFT) && !defined(TRANSA)) || (!defined(LEFT) && defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#elif defined(LEFT)
/* temp = off + INCR_A; // number of values in A */
addi \TEMP_BK, \OFF_VAL, \INCR_A
#else
/* temp = off + INCR_B; // number of values in B */
addi \TEMP_BK,\OFF_VAL, \INCR_B
#endif
.endm
/*
// #if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
//   temp = bk - off;
// #ifdef LEFT
//   temp -= 16; // number of values in A
// #else
//   temp -= 2; // number of values in B
// #endif
//   ptrba += temp*16;
//   ptrbb += temp*2;
// #endif
// #ifdef LEFT
//   off += 16; // number of values in A
// #endif
*/
.macro REFRESH_AFTER_SAVE TEMP_BK,BK_VAL,OFF_VAL,PTR_B,PTR_A,C_A,C_B
#if (defined(LEFT) && defined(TRANSA)) || (!defined(LEFT) && !defined(TRANSA))
/* temp = bk - off; */
sub \TEMP_BK,\BK_VAL,\OFF_VAL
#ifdef LEFT
/* temp -= C_A; // number of values in A */
addi \TEMP_BK,\TEMP_BK,-\C_A
#else
/* temp -= C_B; // number of values in B */
addi \TEMP_BK,\TEMP_BK,-\C_B
#endif
/* ptrba += temp*C_A;
   ptrbb += temp*C_B; */
SHIFT_REG T4,\TEMP_BK,\C_A
SHIFT_REG T2,\TEMP_BK,\C_B
add \PTR_A, \PTR_A, T4 /* ptrba += temp*C_A */
add \PTR_B, \PTR_B, T2
#endif
#ifdef LEFT
/* off += C_A; // number of values in A */
addi \OFF_VAL,\OFF_VAL,\C_A
#endif
.endm