/* OpenBLAS/kernel/power/zgemm_macros_8x2_power8.S */
/***************************************************************************
Copyright (c) 2013-2016, The OpenBLAS Project
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
1. Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in
the documentation and/or other materials provided with the
distribution.
3. Neither the name of the OpenBLAS project nor the names of
its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
/**************************************************************************************
* 2016/04/22 Werner Saar (wernsaar@googlemail.com)
* BLASTEST : OK
* CTEST : OK
* TEST : OK
* LAPACK-TEST : OK
**************************************************************************************/
#if defined(NN) || defined(NT) || defined(TN) || defined(TT)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xssubdp
#define XSFADD_I1 xsadddp
#define XSFADD_I2 xsadddp
#elif defined(CN) || defined(CT) || defined(RN) || defined(RT)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xsadddp
#define XSFADD_I1 xssubdp
#define XSFADD_I2 xsadddp
#elif defined(NC) || defined(TC) || defined(NR) || defined(TR)
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xsadddp
#define XSFADD_I1 xsadddp
#define XSFADD_I2 xssubdp
#else // CC || CR || RC || RR
#define XSFADD_R1 xsadddp
#define XSFADD_R2 xssubdp
#define XSFADD_I1 xssubdp
#define XSFADD_I2 xssubdp
#endif
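/**********************************************************************************************
* Note: the XSFADD_* selections above fold all sixteen transpose/conjugate variants of this
* kernel into sign choices on four scalar adds. Writing the lane-wise accumulator products
* as rr = realA*realB, ir = imagA*realB, ri = realA*imagB, ii = imagA*imagB, the SAVE
* macros below reduce each accumulator pair to
*
*     real = rr ± ii   (sign from XSFADD_R2)
*     imag = ±ir ± ri  (signs from XSFADD_I1 and XSFADD_I2)
*
* where each sign is + for xsadddp and - for xssubdp:
*
*     variant group    R2  I1  I2    result
*     NN NT TN TT      -   +   +     a * b
*     CN CT RN RT      +   -   +     conj(a) * b
*     NC TC NR TR      +   +   -     a * conj(b)
*     CC CR RC RR      -   -   -     conj(a) * conj(b)
**********************************************************************************************/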
/**********************************************************************************************
* Macros for N=2 and M=8
**********************************************************************************************/
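/* Structure note (the same scheme repeats for every N/M combination in this file):
* - On AIX the macros are emitted as m4 define(`...') blocks instead of GAS .macro
*   directives, so that the file can be preprocessed with m4 there; both branches
*   expand to the same instruction sequence.
* - LOADNxM_1 preloads the first A and B vectors. KERNELNxM_I1 initializes the
*   accumulators with xvmuldp while fetching the next inputs; KERNELNxM_1 and
*   KERNELNxM_2 then alternate between the two input register sets (for the M=8
*   case vs0-vs7/vs16-vs19 and vs8-vs15/vs20-vs23; smaller tiles use fewer A
*   registers) so that loads for the next k-iteration overlap the multiply-adds
*   of the current one. KERNELNxM_E2 drains the pipeline without further loads,
*   and KERNELNxM_SUBI1/SUB1 handle the loop remainder one k-iteration at a time.
*   A driver typically expands them roughly as:
*       LOADNxM_1; KERNELNxM_I1; { KERNELNxM_2; KERNELNxM_1; }...; KERNELNxM_E2; SAVENxM
* - Accumulators start at vs32, one even/odd register pair per complex element of C:
*   {realA*realB, imagA*realB} in the even register and {realA*imagB, imagA*imagB}
*   in the odd one. */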
#if defined(_AIX)
define(`LOAD2x8_1', `
#else
.macro LOAD2x8_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_I1', `
#else
.macro KERNEL2x8_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
xvmuldp vs48, vs0, vs18 // real*real, imag*real
xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
xvmuldp vs50, vs1, vs18 // real*real, imag*real
xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
xvmuldp vs52, vs2, vs18 // real*real, imag*real
xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
xvmuldp vs54, vs3, vs18 // real*real, imag*real
xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
xvmuldp vs56, vs4, vs18 // real*real, imag*real
xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
xvmuldp vs58, vs5, vs18 // real*real, imag*real
xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
xvmuldp vs60, vs6, vs18 // real*real, imag*real
xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
xvmuldp vs62, vs7, vs18 // real*real, imag*real
xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_1', `
#else
.macro KERNEL2x8_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
xvmaddadp vs48, vs0, vs18 // real*real, imag*real
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_2', `
#else
.macro KERNEL2x8_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_E2', `
#else
.macro KERNEL2x8_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
xvmaddadp vs48, vs8, vs22 // real*real, imag*real
xvmaddadp vs49, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs50, vs9, vs22 // real*real, imag*real
xvmaddadp vs51, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs52, vs10, vs22 // real*real, imag*real
xvmaddadp vs53, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs54, vs11, vs22 // real*real, imag*real
xvmaddadp vs55, vs11, vs23 // real*imag, imag*imag
xvmaddadp vs56, vs12, vs22 // real*real, imag*real
xvmaddadp vs57, vs12, vs23 // real*imag, imag*imag
xvmaddadp vs58, vs13, vs22 // real*real, imag*real
xvmaddadp vs59, vs13, vs23 // real*imag, imag*imag
xvmaddadp vs60, vs14, vs22 // real*real, imag*real
xvmaddadp vs61, vs14, vs23 // real*imag, imag*imag
xvmaddadp vs62, vs15, vs22 // real*real, imag*real
xvmaddadp vs63, vs15, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUBI1', `
#else
.macro KERNEL2x8_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
xvmuldp vs48, vs0, vs18 // real*real, imag*real
xvmuldp vs49, vs0, vs19 // real*imag, imag*imag
xvmuldp vs50, vs1, vs18 // real*real, imag*real
xvmuldp vs51, vs1, vs19 // real*imag, imag*imag
xvmuldp vs52, vs2, vs18 // real*real, imag*real
xvmuldp vs53, vs2, vs19 // real*imag, imag*imag
xvmuldp vs54, vs3, vs18 // real*real, imag*real
xvmuldp vs55, vs3, vs19 // real*imag, imag*imag
xvmuldp vs56, vs4, vs18 // real*real, imag*real
xvmuldp vs57, vs4, vs19 // real*imag, imag*imag
xvmuldp vs58, vs5, vs18 // real*real, imag*real
xvmuldp vs59, vs5, vs19 // real*imag, imag*imag
xvmuldp vs60, vs6, vs18 // real*real, imag*real
xvmuldp vs61, vs6, vs19 // real*imag, imag*imag
xvmuldp vs62, vs7, vs18 // real*real, imag*real
xvmuldp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x8_SUB1', `
#else
.macro KERNEL2x8_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
xvmaddadp vs48, vs0, vs18 // real*real, imag*real
xvmaddadp vs49, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs50, vs1, vs18 // real*real, imag*real
xvmaddadp vs51, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs52, vs2, vs18 // real*real, imag*real
xvmaddadp vs53, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs54, vs3, vs18 // real*real, imag*real
xvmaddadp vs55, vs3, vs19 // real*imag, imag*imag
xvmaddadp vs56, vs4, vs18 // real*real, imag*real
xvmaddadp vs57, vs4, vs19 // real*imag, imag*imag
xvmaddadp vs58, vs5, vs18 // real*real, imag*real
xvmaddadp vs59, vs5, vs19 // real*imag, imag*imag
xvmaddadp vs60, vs6, vs18 // real*real, imag*real
xvmaddadp vs61, vs6, vs19 // real*imag, imag*imag
xvmaddadp vs62, vs7, vs18 // real*real, imag*real
xvmaddadp vs63, vs7, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x8', `
#else
.macro SAVE2x8
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
lxvd2x vs20, o0, T2
lxvd2x vs21, o16, T2
lxvd2x vs22, o32, T2
lxvd2x vs23, o48, T2
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // imagA*realB
XSFADD_I2 vs1, vs1, vs37 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // imagA*realB
XSFADD_I2 vs1, vs1, vs39 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs40 // realA*realB
XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs40 // imagA*realB
XSFADD_I2 vs1, vs1, vs41 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs42 // realA*realB
XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs42 // imagA*realB
XSFADD_I2 vs1, vs1, vs43 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs44 // realA*realB
XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs44 // imagA*realB
XSFADD_I2 vs1, vs1, vs45 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs46 // realA*realB
XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs46 // imagA*realB
XSFADD_I2 vs1, vs1, vs47 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
xvadddp vs12, vs12, vs20
xvadddp vs13, vs13, vs21
xvadddp vs14, vs14, vs22
xvadddp vs15, vs15, vs23
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, o0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
lxvd2x vs20, o0, T2
lxvd2x vs21, o16, T2
lxvd2x vs22, o32, T2
lxvd2x vs23, o48, T2
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs49,vs49) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs48 // realA*realB
XSFADD_R2 vs0, vs0, vs49 // imagA*imagB
XXSWAPD(vs48,vs48) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs49,vs49) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs48 // imagA*realB
XSFADD_I2 vs1, vs1, vs49 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs51,vs51) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs50 // realA*realB
XSFADD_R2 vs0, vs0, vs51 // imagA*imagB
XXSWAPD(vs50,vs50) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs51,vs51) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs50 // imagA*realB
XSFADD_I2 vs1, vs1, vs51 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs53,vs53) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs52 // realA*realB
XSFADD_R2 vs0, vs0, vs53 // imagA*imagB
XXSWAPD(vs52,vs52) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs53,vs53) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs52 // imagA*realB
XSFADD_I2 vs1, vs1, vs53 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs55,vs55) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs54 // realA*realB
XSFADD_R2 vs0, vs0, vs55 // imagA*imagB
XXSWAPD(vs54,vs54) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs55,vs55) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs54 // imagA*realB
XSFADD_I2 vs1, vs1, vs55 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs57,vs57) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs56 // realA*realB
XSFADD_R2 vs0, vs0, vs57 // imagA*imagB
XXSWAPD(vs56,vs56) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs57,vs57) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs56 // imagA*realB
XSFADD_I2 vs1, vs1, vs57 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs59,vs59) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs58 // realA*realB
XSFADD_R2 vs0, vs0, vs59 // imagA*imagB
XXSWAPD(vs58,vs58) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs59,vs59) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs58 // imagA*realB
XSFADD_I2 vs1, vs1, vs59 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs61,vs61) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs60 // realA*realB
XSFADD_R2 vs0, vs0, vs61 // imagA*imagB
XXSWAPD(vs60,vs60) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs61,vs61) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs60 // imagA*realB
XSFADD_I2 vs1, vs1, vs61 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs63,vs63) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs62 // realA*realB
XSFADD_R2 vs0, vs0, vs63 // imagA*imagB
XXSWAPD(vs62,vs62) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs63,vs63) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs62 // imagA*realB
XSFADD_I2 vs1, vs1, vs63 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
xvadddp vs12, vs12, vs20
xvadddp vs13, vs13, vs21
xvadddp vs14, vs14, vs22
xvadddp vs15, vs15, vs23
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, o0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
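/* In C terms, each complex element in the SAVE macros is produced roughly as
* follows (illustrative sketch only, shown with the NN sign choices; rr/ir/ri/ii
* are the reduced accumulator lanes described above and c[] is the corresponding
* element of C):
*
*   static void save_one(double rr, double ir, double ri, double ii,
*                        double alpha_r, double alpha_i, double c[2])
*   {
*       double re = rr - ii;                 // XSFADD_R1 / XSFADD_R2
*       double im = ir + ri;                 // XSFADD_I1 / XSFADD_I2
*       c[0] += re * alpha_r - im * alpha_i; // xsmuldp, xssubdp
*       c[1] += re * alpha_i + im * alpha_r; // xsmuldp, xsadddp
*       // TRMMKERNEL builds overwrite c[] instead of accumulating into it
*   }
*/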
/**********************************************************************************************
* Macros for N=2 and M=4
**********************************************************************************************/
#if defined(_AIX)
define(`LOAD2x4_1', `
#else
.macro LOAD2x4_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_I1', `
#else
.macro KERNEL2x4_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs0, vs18 // real*real, imag*real
xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
xvmuldp vs42, vs1, vs18 // real*real, imag*real
xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
xvmuldp vs44, vs2, vs18 // real*real, imag*real
xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
xvmuldp vs46, vs3, vs18 // real*real, imag*real
xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_1', `
#else
.macro KERNEL2x4_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs0, vs18 // real*real, imag*real
xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs42, vs1, vs18 // real*real, imag*real
xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs44, vs2, vs18 // real*real, imag*real
xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs46, vs3, vs18 // real*real, imag*real
xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_2', `
#else
.macro KERNEL2x4_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs8, vs22 // real*real, imag*real
xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs42, vs9, vs22 // real*real, imag*real
xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs44, vs10, vs22 // real*real, imag*real
xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs46, vs11, vs22 // real*real, imag*real
xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_E2', `
#else
.macro KERNEL2x4_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs8, vs22 // real*real, imag*real
xvmaddadp vs41, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs42, vs9, vs22 // real*real, imag*real
xvmaddadp vs43, vs9, vs23 // real*imag, imag*imag
xvmaddadp vs44, vs10, vs22 // real*real, imag*real
xvmaddadp vs45, vs10, vs23 // real*imag, imag*imag
xvmaddadp vs46, vs11, vs22 // real*real, imag*real
xvmaddadp vs47, vs11, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_SUBI1', `
#else
.macro KERNEL2x4_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs0, vs18 // real*real, imag*real
xvmuldp vs41, vs0, vs19 // real*imag, imag*imag
xvmuldp vs42, vs1, vs18 // real*real, imag*real
xvmuldp vs43, vs1, vs19 // real*imag, imag*imag
xvmuldp vs44, vs2, vs18 // real*real, imag*real
xvmuldp vs45, vs2, vs19 // real*imag, imag*imag
xvmuldp vs46, vs3, vs18 // real*real, imag*real
xvmuldp vs47, vs3, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x4_SUB1', `
#else
.macro KERNEL2x4_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs0, vs18 // real*real, imag*real
xvmaddadp vs41, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs42, vs1, vs18 // real*real, imag*real
xvmaddadp vs43, vs1, vs19 // real*imag, imag*imag
xvmaddadp vs44, vs2, vs18 // real*real, imag*real
xvmaddadp vs45, vs2, vs19 // real*imag, imag*imag
xvmaddadp vs46, vs3, vs18 // real*real, imag*real
xvmaddadp vs47, vs3, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x4', `
#else
.macro SAVE2x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // imagA*realB
XSFADD_I2 vs1, vs1, vs37 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // imagA*realB
XSFADD_I2 vs1, vs1, vs39 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs40 // realA*realB
XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs40 // imagA*realB
XSFADD_I2 vs1, vs1, vs41 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs42 // realA*realB
XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs42 // imagA*realB
XSFADD_I2 vs1, vs1, vs43 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs44 // realA*realB
XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs44 // imagA*realB
XSFADD_I2 vs1, vs1, vs45 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs46 // realA*realB
XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs46 // imagA*realB
XSFADD_I2 vs1, vs1, vs47 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
add T1, T1, LDC
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=2
**********************************************************************************************/
#if defined(_AIX)
define(`LOAD2x2_1', `
#else
.macro LOAD2x2_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_I1', `
#else
.macro KERNEL2x2_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs0, vs18 // real*real, imag*real
xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
xvmuldp vs38, vs1, vs18 // real*real, imag*real
xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_1', `
#else
.macro KERNEL2x2_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs0, vs18 // real*real, imag*real
xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs38, vs1, vs18 // real*real, imag*real
xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_2', `
#else
.macro KERNEL2x2_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs8, vs22 // real*real, imag*real
xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs38, vs9, vs22 // real*real, imag*real
xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_E2', `
#else
.macro KERNEL2x2_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs8, vs22 // real*real, imag*real
xvmaddadp vs37, vs8, vs23 // real*imag, imag*imag
xvmaddadp vs38, vs9, vs22 // real*real, imag*real
xvmaddadp vs39, vs9, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_SUBI1', `
#else
.macro KERNEL2x2_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs0, vs18 // real*real, imag*real
xvmuldp vs37, vs0, vs19 // real*imag, imag*imag
xvmuldp vs38, vs1, vs18 // real*real, imag*real
xvmuldp vs39, vs1, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x2_SUB1', `
#else
.macro KERNEL2x2_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs0, vs18 // real*real, imag*real
xvmaddadp vs37, vs0, vs19 // real*imag, imag*imag
xvmaddadp vs38, vs1, vs18 // real*real, imag*real
xvmaddadp vs39, vs1, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE2x2', `
#else
.macro SAVE2x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // imagA*realB
XSFADD_I2 vs1, vs1, vs37 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // imagA*realB
XSFADD_I2 vs1, vs1, vs39 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
add T1, T1, LDC
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=2 and M=1
**********************************************************************************************/
#if defined(_AIX)
define(`LOAD2x1_1', `
#else
.macro LOAD2x1_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_I1', `
#else
.macro KERNEL2x1_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs0, vs18 // real*real, imag*real
xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_1', `
#else
.macro KERNEL2x1_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
lxvd2x vs22, o32, BO // load real part from B
lxvd2x vs23, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs0, vs18 // real*real, imag*real
xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_2', `
#else
.macro KERNEL2x1_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs8, vs22 // real*real, imag*real
xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_E2', `
#else
.macro KERNEL2x1_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs8, vs22 // real*real, imag*real
xvmaddadp vs35, vs8, vs23 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_SUBI1', `
#else
.macro KERNEL2x1_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs0, vs18 // real*real, imag*real
xvmuldp vs35, vs0, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL2x1_SUB1', `
#else
.macro KERNEL2x1_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
lxvd2x vs18, o32, BO // load real part from B
lxvd2x vs19, o48, BO // load imag part from B
addi BO, BO, 64
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs0, vs18 // real*real, imag*real
xvmaddadp vs35, vs0, vs19 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
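/* Editor note: every SAVE macro reduces one accumulator pair with the same
 * swap-and-fold sequence.  A hedged C sketch for the default (NN-type) sign
 * mapping, using hypothetical scalar names:
 *
 *   double re = rArB - iAiB;                   // XSFADD_R1 / XSFADD_R2
 *   double im = iArB + rAiB;                   // XSFADD_I1 / XSFADD_I2
 *   double c_re = re * alpha_r - im * alpha_i;
 *   double c_im = re * alpha_i + im * alpha_r;
 *
 * Conjugated kernels change only the add/sub choice behind the XSFADD_*
 * mappings; the instruction sequence itself is identical.
 */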
#if defined(_AIX)
define(`SAVE2x1', `
#else
.macro SAVE2x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
#endif
stxvd2x vs8, o0, T1
add T1, T1, LDC
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
#endif
stxvd2x vs8, o0, T1
add T1, T1, LDC
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=8
**********************************************************************************************/
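/* Editor note: with N=1 only one B column is live, so each iteration loads
 * two vectors from B (splatted real part, splatted imag part) and eight from
 * A, keeping sixteen accumulators vs32-vs47, one real/imag pair per A vector.
 */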
#if defined(_AIX)
define(`LOAD1x8_1', `
#else
.macro LOAD1x8_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_I1', `
#else
.macro KERNEL1x8_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_1', `
#else
.macro KERNEL1x8_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs12, o0, AO // load real,imag from A
lxvd2x vs13, o16, AO // load real,imag from A
lxvd2x vs14, o32, AO // load real,imag from A
lxvd2x vs15, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_2', `
#else
.macro KERNEL1x8_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_E2', `
#else
.macro KERNEL1x8_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
xvmaddadp vs40, vs12, vs20 // real*real, imag*real
xvmaddadp vs41, vs12, vs21 // real*imag, imag*imag
xvmaddadp vs42, vs13, vs20 // real*real, imag*real
xvmaddadp vs43, vs13, vs21 // real*imag, imag*imag
xvmaddadp vs44, vs14, vs20 // real*real, imag*real
xvmaddadp vs45, vs14, vs21 // real*imag, imag*imag
xvmaddadp vs46, vs15, vs20 // real*real, imag*real
xvmaddadp vs47, vs15, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_SUBI1', `
#else
.macro KERNEL1x8_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
xvmuldp vs40, vs4, vs16 // real*real, imag*real
xvmuldp vs41, vs4, vs17 // real*imag, imag*imag
xvmuldp vs42, vs5, vs16 // real*real, imag*real
xvmuldp vs43, vs5, vs17 // real*imag, imag*imag
xvmuldp vs44, vs6, vs16 // real*real, imag*real
xvmuldp vs45, vs6, vs17 // real*imag, imag*imag
xvmuldp vs46, vs7, vs16 // real*real, imag*real
xvmuldp vs47, vs7, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x8_SUB1', `
#else
.macro KERNEL1x8_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs4, o0, AO // load real,imag from A
lxvd2x vs5, o16, AO // load real,imag from A
lxvd2x vs6, o32, AO // load real,imag from A
lxvd2x vs7, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
xvmaddadp vs40, vs4, vs16 // real*real, imag*real
xvmaddadp vs41, vs4, vs17 // real*imag, imag*imag
xvmaddadp vs42, vs5, vs16 // real*real, imag*real
xvmaddadp vs43, vs5, vs17 // real*imag, imag*imag
xvmaddadp vs44, vs6, vs16 // real*real, imag*real
xvmaddadp vs45, vs6, vs17 // real*imag, imag*imag
xvmaddadp vs46, vs7, vs16 // real*real, imag*real
xvmaddadp vs47, vs7, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x8', `
#else
.macro SAVE1x8
#endif
mr T1, CO
addi T2, T1, 64
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
lxvd2x vs20, o0, T2
lxvd2x vs21, o16, T2
lxvd2x vs22, o32, T2
lxvd2x vs23, o48, T2
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // imagA*realB
XSFADD_I2 vs1, vs1, vs37 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // imagA*realB
XSFADD_I2 vs1, vs1, vs39 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs41,vs41) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs40 // realA*realB
XSFADD_R2 vs0, vs0, vs41 // imagA*imagB
XXSWAPD(vs40,vs40) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs41,vs41) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs40 // imagA*realB
XSFADD_I2 vs1, vs1, vs41 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs12, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs43,vs43) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs42 // realA*realB
XSFADD_R2 vs0, vs0, vs43 // imagA*imagB
XXSWAPD(vs42,vs42) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs43,vs43) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs42 // imagA*realB
XSFADD_I2 vs1, vs1, vs43 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs13, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs45,vs45) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs44 // realA*realB
XSFADD_R2 vs0, vs0, vs45 // imagA*imagB
XXSWAPD(vs44,vs44) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs45,vs45) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs44 // imagA*realB
XSFADD_I2 vs1, vs1, vs45 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs14, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs47,vs47) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs46 // realA*realB
XSFADD_R2 vs0, vs0, vs47 // imagA*imagB
XXSWAPD(vs46,vs46) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs47,vs47) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs46 // imagA*realB
XSFADD_I2 vs1, vs1, vs47 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs15, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
xvadddp vs12, vs12, vs20
xvadddp vs13, vs13, vs21
xvadddp vs14, vs14, vs22
xvadddp vs15, vs15, vs23
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
stxvd2x vs12, o0, T2
stxvd2x vs13, o16, T2
stxvd2x vs14, o32, T2
stxvd2x vs15, o48, T2
add T1, T1, LDC
add T2, T2, LDC
addi CO, CO, 128
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=4
**********************************************************************************************/
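/* Editor note: same scheme as the 1x8 kernels with four A vectors per
 * iteration, accumulating into vs32-vs39.
 */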
#if defined(_AIX)
define(`LOAD1x4_1', `
#else
.macro LOAD1x4_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_I1', `
#else
.macro KERNEL1x4_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_1', `
#else
.macro KERNEL1x4_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
lxvd2x vs10, o32, AO // load real,imag from A
lxvd2x vs11, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_2', `
#else
.macro KERNEL1x4_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_E2', `
#else
.macro KERNEL1x4_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
xvmaddadp vs36, vs10, vs20 // real*real, imag*real
xvmaddadp vs37, vs10, vs21 // real*imag, imag*imag
xvmaddadp vs38, vs11, vs20 // real*real, imag*real
xvmaddadp vs39, vs11, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_SUBI1', `
#else
.macro KERNEL1x4_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
xvmuldp vs36, vs2, vs16 // real*real, imag*real
xvmuldp vs37, vs2, vs17 // real*imag, imag*imag
xvmuldp vs38, vs3, vs16 // real*real, imag*real
xvmuldp vs39, vs3, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x4_SUB1', `
#else
.macro KERNEL1x4_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
lxvd2x vs2, o32, AO // load real,imag from A
lxvd2x vs3, o48, AO // load real,imag from A
addi AO, AO, 64
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
xvmaddadp vs36, vs2, vs16 // real*real, imag*real
xvmaddadp vs37, vs2, vs17 // real*imag, imag*imag
xvmaddadp vs38, vs3, vs16 // real*real, imag*real
xvmaddadp vs39, vs3, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x4', `
#else
.macro SAVE1x4
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
lxvd2x vs18, o32, T1
lxvd2x vs19, o48, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs37,vs37) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs36 // realA*realB
XSFADD_R2 vs0, vs0, vs37 // imagA*imagB
XXSWAPD(vs36,vs36) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs37,vs37) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs36 // imagA*realB
XSFADD_I2 vs1, vs1, vs37 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs10, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs39,vs39) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs38 // realA*realB
XSFADD_R2 vs0, vs0, vs39 // imagA*imagB
XXSWAPD(vs38,vs38) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs39,vs39) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs38 // imagA*realB
XSFADD_I2 vs1, vs1, vs39 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs11, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
xvadddp vs10, vs10, vs18
xvadddp vs11, vs11, vs19
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
stxvd2x vs10, o32, T1
stxvd2x vs11, o48, T1
add T1, T1, LDC
addi CO, CO, 64
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=2
**********************************************************************************************/
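/* Editor note: same scheme with two A vectors per iteration, accumulating
 * into vs32-vs35.
 */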
#if defined(_AIX)
define(`LOAD1x2_1', `
#else
.macro LOAD1x2_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_I1', `
#else
.macro KERNEL1x2_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_1', `
#else
.macro KERNEL1x2_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
lxvd2x vs9, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_2', `
#else
.macro KERNEL1x2_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_E2', `
#else
.macro KERNEL1x2_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
xvmaddadp vs34, vs9, vs20 // real*real, imag*real
xvmaddadp vs35, vs9, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUBI1', `
#else
.macro KERNEL1x2_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
xvmuldp vs34, vs1, vs16 // real*real, imag*real
xvmuldp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x2_SUB1', `
#else
.macro KERNEL1x2_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
lxvd2x vs1, o16, AO // load real,imag from A
addi AO, AO, 32
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
xvmaddadp vs34, vs1, vs16 // real*real, imag*real
xvmaddadp vs35, vs1, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x2', `
#else
.macro SAVE1x2
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
lxvd2x vs17, o16, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs35,vs35) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs34 // realA*realB
XSFADD_R2 vs0, vs0, vs35 // imagA*imagB
XXSWAPD(vs34,vs34) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs35,vs35) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs34 // imagA*realB
XSFADD_I2 vs1, vs1, vs35 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs9, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
xvadddp vs9, vs9, vs17
#endif
stxvd2x vs8, o0, T1
stxvd2x vs9, o16, T1
add T1, T1, LDC
addi CO, CO, 32
#if defined(_AIX)
')
#else
.endm
#endif
/**********************************************************************************************
* Macros for N=1 and M=1
**********************************************************************************************/
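/* Editor note: scalar remainder case, one complex element of A per
 * iteration, accumulating into the single pair vs32/vs33.
 */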
#if defined(_AIX)
define(`LOAD1x1_1', `
#else
.macro LOAD1x1_1
#endif
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_I1', `
#else
.macro KERNEL1x1_I1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_1', `
#else
.macro KERNEL1x1_1
#endif
lxvd2x vs8, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs20, o0, BO // load real part from B
lxvd2x vs21, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_2', `
#else
.macro KERNEL1x1_2
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_E2', `
#else
.macro KERNEL1x1_E2
#endif
xvmaddadp vs32, vs8, vs20 // real*real, imag*real
xvmaddadp vs33, vs8, vs21 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUBI1', `
#else
.macro KERNEL1x1_SUBI1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmuldp vs32, vs0, vs16 // real*real, imag*real
xvmuldp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`KERNEL1x1_SUB1', `
#else
.macro KERNEL1x1_SUB1
#endif
lxvd2x vs0, o0, AO // load real,imag from A
addi AO, AO, 16
lxvd2x vs16, o0, BO // load real part from B
lxvd2x vs17, o16, BO // load imag part from B
addi BO, BO, 32
xvmaddadp vs32, vs0, vs16 // real*real, imag*real
xvmaddadp vs33, vs0, vs17 // real*imag, imag*imag
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`SAVE1x1', `
#else
.macro SAVE1x1
#endif
mr T1, CO
#ifndef TRMMKERNEL
lxvd2x vs16, o0, T1
#endif
xxlxor vs0, vs0, vs0
xxlxor vs1, vs1, vs1
XXSWAPD(vs33,vs33) // realA*imagB, imagA*imagB -> imagA*imagB, realA*imagB
XSFADD_R1 vs0, vs0, vs32 // realA*realB
XSFADD_R2 vs0, vs0, vs33 // imagA*imagB
XXSWAPD(vs32,vs32) // realA*realB, imagA*realB -> imagA*realB, realA*realB
XXSWAPD(vs33,vs33) // imagA*imagB, realA*imagB -> realA*imagB, imagA*imagB
XSFADD_I1 vs1, vs1, vs32 // imagA*realB
XSFADD_I2 vs1, vs1, vs33 // realA*imagB
xsmuldp vs4, vs0, alpha_r // real*alpha_r
xsmuldp vs5, vs1, alpha_i // imag*alpha_i
xsmuldp vs6, vs0, alpha_i // real*alpha_i
xsmuldp vs7, vs1, alpha_r // imag*alpha_r
xssubdp vs2, vs4, vs5 // real*alpha_r - imag*alpha_i
xsadddp vs3, vs6, vs7 // real*alpha_i + imag*alpha_r
xxpermdi vs8, vs2, vs3, 0 // merge real and imag part
#ifndef TRMMKERNEL
xvadddp vs8, vs8, vs16
#endif
stxvd2x vs8, o0, T1
add T1, T1, LDC
addi CO, CO, 16
#if defined(_AIX)
')
#else
.endm
#endif
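/* Editor note: the copy macros below repack B for the kernels above.  Each
 * complex element {b_re, b_im} is expanded into two vectors, {b_re, b_re}
 * and {b_im, b_im}, so the kernels can feed xvmaddadp directly without
 * per-iteration splats.  A hedged C sketch of the transform (hypothetical
 * names, one element per step):
 *
 *   // in:  bo  holds n complex doubles as {re, im} pairs
 *   // out: bbo holds 2*n vectors as {re, re}, {im, im} pairs
 *   for (long k = 0; k < n; k++) {
 *       bbo[4*k+0] = bbo[4*k+1] = bo[2*k+0];  // lxvdsx / XXSPLTD of real
 *       bbo[4*k+2] = bbo[4*k+3] = bo[2*k+1];  // lxvdsx / XXSPLTD of imag
 *   }
 */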
#if defined(_AIX)
define(`ZCOPYB_1x1', `
#else
.macro ZCOPYB_1x1
#endif
lxvdsx vs4, o0, BO // b0_r
lxvdsx vs5, o8, BO // b0_i
addi BO, BO, 16
stxvd2x vs4, o0, BBO
stxvd2x vs5, o16, BBO
addi BBO, BBO, 32
#if defined(_AIX)
')
#else
.endm
#endif
#if defined(_AIX)
define(`ZCOPYB_8x1', `
#else
.macro ZCOPYB_8x1
#endif
lxvd2x vs32, o0, BO
lxvd2x vs33, o16, BO
lxvd2x vs34, o32, BO
lxvd2x vs35, o48, BO
addi BO, BO, 64
lxvd2x vs36, o0, BO
lxvd2x vs37, o16, BO
lxvd2x vs38, o32, BO
lxvd2x vs39, o48, BO
addi BO, BO, 64
XXSPLTD(vs40,vs32,0)
XXSPLTD(vs41,vs32,1)
XXSPLTD(vs42,vs33,0)
XXSPLTD(vs43,vs33,1)
XXSPLTD(vs44,vs34,0)
XXSPLTD(vs45,vs34,1)
XXSPLTD(vs46,vs35,0)
XXSPLTD(vs47,vs35,1)
XXSPLTD(vs48,vs36,0)
XXSPLTD(vs49,vs36,1)
XXSPLTD(vs50,vs37,0)
XXSPLTD(vs51,vs37,1)
XXSPLTD(vs52,vs38,0)
XXSPLTD(vs53,vs38,1)
XXSPLTD(vs54,vs39,0)
XXSPLTD(vs55,vs39,1)
stxvd2x vs40, o0, BBO
stxvd2x vs41, o16, BBO
stxvd2x vs42, o32, BBO
stxvd2x vs43, o48, BBO
addi BBO, BBO, 64
stxvd2x vs44, o0, BBO
stxvd2x vs45, o16, BBO
stxvd2x vs46, o32, BBO
stxvd2x vs47, o48, BBO
addi BBO, BBO, 64
stxvd2x vs48, o0, BBO
stxvd2x vs49, o16, BBO
stxvd2x vs50, o32, BBO
stxvd2x vs51, o48, BBO
addi BBO, BBO, 64
stxvd2x vs52, o0, BBO
stxvd2x vs53, o16, BBO
stxvd2x vs54, o32, BBO
stxvd2x vs55, o48, BBO
addi BBO, BBO, 64
#if defined(_AIX)
')
#else
.endm
#endif