// OpenBLAS/kernel/loongarch64/caxpby_lsx.S

#define ASSEMBLER
#include "common.h"
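// caxpby kernel (LSX): y := alpha * x + beta * y on complex vectors
// (single precision; ZAXPBY when DOUBLE is defined). The main loops process
// four complex elements per iteration; a scalar tail covers the remainder.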
#define N $r4
#define ALPHAR $f0
#define ALPHAI $f1
#define X $r5
#define INCX $r6
#define BETAR $f2
#define BETAI $f3
#define Y $r7
#define INCY $r8
#define I $r12
#define TEMP $r13
#define t1 $r14
#define t2 $r16
#define t3 $r15
#define t4 $r17
#define XX $r18
#define YY $r19
#define a1 $f12
#define a2 $f13
#define a3 $f14
#define a4 $f15
#define s1 $f16
#define s2 $f17
#define s3 $f18
#define s4 $f19
#define VX0 $vr8
#define VX1 $vr20
#define VX2 $vr21
#define VX3 $vr22
#define VXAR $vr23 // alpha_r broadcast to all lanes
#define VXAI $vr19 // alpha_i broadcast to all lanes
#define VXBR $vr14 // beta_r broadcast to all lanes
#define VXBI $vr13 // beta_i broadcast to all lanes
#define VXZ $vr12 // all-zero vector
#define x1 $vr18
#define x2 $vr17
#define x3 $vr16
#define x4 $vr15
PROLOGUE
bge $r0, N, .L999
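// a1 = 0.0, used below to test the alpha and beta components against zero.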
movgr2fr.d a1, $r0
#ifdef DOUBLE
ffint.d.l a1, a1
#else
ffint.s.l a1, a1
#endif
slli.d INCX, INCX, ZBASE_SHIFT
slli.d INCY, INCY, ZBASE_SHIFT
#ifdef DOUBLE
movfr2gr.d t1, ALPHAR
vreplgr2vr.d VXAR, t1
movfr2gr.d t2, ALPHAI
vreplgr2vr.d VXAI, t2
movfr2gr.d t3, BETAR
vreplgr2vr.d VXBR, t3
movfr2gr.d t4, BETAI
vreplgr2vr.d VXBI, t4
#else
movfr2gr.s t1, ALPHAR
vreplgr2vr.w VXAR, t1
movfr2gr.s t2, ALPHAI
vreplgr2vr.w VXAI, t2
movfr2gr.s t3, BETAR
vreplgr2vr.w VXBR, t3
movfr2gr.s t4, BETAI
vreplgr2vr.w VXBI, t4
#endif
vxor.v VXZ, VXZ, VXZ
// If incx == 0 || incy == 0, do one by one
and TEMP, INCX, INCY
or I, N, N
beqz TEMP, .L998
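// TEMP = byte stride of one complex element; I = N / 4, since the main
// loops are unrolled by four complex elements.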
li.d TEMP, 1
slli.d TEMP, TEMP, ZBASE_SHIFT
srai.d I, N, 2
bne INCX, TEMP, .L20
bne INCY, TEMP, .L12 // INCX==1 and INCY!=1
b .L11 // INCX==1 and INCY==1
.L20:
bne INCY, TEMP, .L22 // INCX!=1 and INCY!=1
b .L21 // INCX!=1 and INCY==1
.L11:
bge $r0, I, .L997
#ifdef DOUBLE
fcmp.ceq.d $fcc0, BETAR, a1
fcmp.ceq.d $fcc1, BETAI, a1
fcmp.ceq.d $fcc2, ALPHAR, a1
fcmp.ceq.d $fcc3, ALPHAI, a1
#else
fcmp.ceq.s $fcc0, BETAR, a1
fcmp.ceq.s $fcc1, BETAI, a1
fcmp.ceq.s $fcc2, ALPHAR, a1
fcmp.ceq.s $fcc3, ALPHAI, a1
#endif
bceqz $fcc0, .L13
bceqz $fcc1, .L13
b .L14
.align 3
.L13:
bceqz $fcc2, .L114
bceqz $fcc3, .L114 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
b .L113 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
.L14:
bceqz $fcc2, .L112
bceqz $fcc3, .L112 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
b .L111 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
.align 3
.L111: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
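// Unit strides with alpha == beta == 0: store the zero vector straight to y.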
#ifdef DOUBLE
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 2 * SIZE
vst VXZ, Y, 4 * SIZE
vst VXZ, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
#else
vst VXZ, Y, 0 * SIZE
vst VXZ, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L111
b .L997
.align 3
#endif
.L112: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
#ifdef DOUBLE
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
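// vpickev/vpickod split the interleaved data: x1 = Re(x), x2 = Im(x).
// Then x3 = alpha_r*Re(x) - alpha_i*Im(x) and x4 = alpha_r*Im(x) + alpha_i*Re(x),
// i.e. the real and imaginary parts of alpha * x. The !DOUBLE branch mirrors
// this with .w/.s instructions.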
vfmul.d x3, VXAI, x2
vfmul.d x4, VXAI, x1
vfmsub.d x3, VXAR, x1, x3
vfmadd.d x4, VXAR, x2, x4
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXAI, x2
vfmul.d x4, VXAI, x1
vfmsub.d x3, VXAR, x1, x3
vfmadd.d x4, VXAR, x2, x4
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
#else
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VXAI, x2
vfmul.s x4, VXAI, x1
vfmsub.s x3, VXAR, x1, x3
vfmadd.s x4, VXAR, x2, x4
vilvl.w VX2, x4, x3
vilvh.w VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L112
b .L997
.align 3
#endif
.L113: //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
#ifdef DOUBLE
vld VX0, Y, 0 * SIZE
vld VX1, Y, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
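// x1 = Re(y), x2 = Im(y); x3/x4 become the real/imaginary parts of beta * y.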
vfmul.d x3, VXBI, x2
vfmul.d x4, VXBI, x1
vfmsub.d x3, VXBR, x1, x3
vfmadd.d x4, VXBR, x2, x4
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, Y, 4 * SIZE
vld VX1, Y, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d x3, VXBI, x2
vfmul.d x4, VXBI, x1
vfmsub.d x3, VXBR, x1, x3
vfmadd.d x4, VXBR, x2, x4
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
#else
vld VX0, Y, 0 * SIZE
vld VX1, Y, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s x3, VXBI, x2
vfmul.s x4, VXBI, x1
vfmsub.s x3, VXBR, x1, x3
vfmadd.s x4, VXBR, x2, x4
vilvl.w VX2, x4, x3
vilvh.w VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L113
b .L997
.align 3
#endif
.L114:
#ifdef DOUBLE
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
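// x1/x2 = Re/Im of x and x3/x4 = Re/Im of y; VX0/VX1 accumulate the
// real/imaginary parts of alpha * x, VX2/VX3 those of beta * y, and the
// sums give the new y.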
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
#else
vld VX0, X, 0 * SIZE
vld VX1, X, 4 * SIZE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 4 * SIZE
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vpickev.w x3, VX3, VX2
vpickod.w x4, VX3, VX2
vfmul.s VX0, VXAI, x2
vfmul.s VX1, VXAI, x1
vfmul.s VX2, VXBI, x4
vfmul.s VX3, VXBI, x3
vfmsub.s VX0, VXAR, x1, VX0
vfmadd.s VX1, VXAR, x2, VX1
vfmsub.s VX2, VXBR, x3, VX2
vfmadd.s VX3, VXBR, x4, VX3
vfadd.s x3, VX0, VX2
vfadd.s x4, VX1, VX3
vilvl.w VX2, x4, x3
vilvh.w VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d X, X, 8 * SIZE
addi.d Y, Y, 8 * SIZE
addi.d I, I, -1
blt $r0, I, .L114
b .L997
.align 3
#endif
.L12: // INCX==1 and INCY!=1
bge $r0, I, .L997
move YY, Y
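// x is contiguous (vld); y is strided, so gather two complex elements per
// step with scalar loads and scatter the results with vstelm.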
.align 3
.L121:
#ifdef DOUBLE
vld VX0, X, 0 * SIZE
vld VX1, X, 2 * SIZE
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vld VX0, X, 4 * SIZE
vld VX1, X, 6 * SIZE
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vpickev.d x1, VX1, VX0
vpickod.d x2, VX1, VX0
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
addi.d I, I, -1
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
#else
vld VX0, X, 0 * SIZE
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 0
vinsgr2vr.w x4, t2, 0
vinsgr2vr.w x3, t3, 1
vinsgr2vr.w x4, t4, 1
vld VX1, X, 4 * SIZE
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
vinsgr2vr.w x3, t1, 2
vinsgr2vr.w x4, t2, 2
vinsgr2vr.w x3, t3, 3
vinsgr2vr.w x4, t4, 3
add.d Y, Y, INCY
vpickev.w x1, VX1, VX0
vpickod.w x2, VX1, VX0
vfmul.s VX0, VXAI, x2
vfmul.s VX1, VXAI, x1
vfmul.s VX2, VXBI, x4
vfmul.s VX3, VXBI, x3
vfmsub.s VX0, VXAR, x1, VX0
vfmadd.s VX1, VXAR, x2, VX1
vfmsub.s VX2, VXBR, x3, VX2
vfmadd.s VX3, VXBR, x4, VX3
vfadd.s x3, VX0, VX2
vfadd.s x4, VX1, VX3
addi.d I, I, -1
vstelm.w x3, YY, 0 * SIZE, 0
vstelm.w x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 1
vstelm.w x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 2
vstelm.w x4, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 3
vstelm.w x4, YY, 1 * SIZE, 3
add.d YY, YY, INCY
addi.d X, X, 8 * SIZE
blt $r0, I, .L121
b .L997
.align 3
#endif
.L21: // INCX!=1 and INCY==1
bge $r0, I, .L997
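// y is contiguous (vld/vst); x is strided and gathered with scalar loads.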
.align 3
.L211:
#ifdef DOUBLE
vld VX2, Y, 0 * SIZE
vld VX3, Y, 2 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
vst VX2, Y, 0 * SIZE
vst VX3, Y, 2 * SIZE
vld VX2, Y, 4 * SIZE
vld VX3, Y, 6 * SIZE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
vpickev.d x3, VX3, VX2
vpickod.d x4, VX3, VX2
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vilvl.d VX2, x4, x3
vilvh.d VX3, x4, x3
addi.d I, I, -1
vst VX2, Y, 4 * SIZE
vst VX3, Y, 6 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
#else
vld VX2, Y, 0 * SIZE
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
vld VX3, Y, 4 * SIZE
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX
vpickev.w x3, VX3, VX2
vpickod.w x4, VX3, VX2
vfmul.s VX0, VXAI, x2
vfmul.s VX1, VXAI, x1
vfmul.s VX2, VXBI, x4
vfmul.s VX3, VXBI, x3
vfmsub.s VX0, VXAR, x1, VX0
vfmadd.s VX1, VXAR, x2, VX1
vfmsub.s VX2, VXBR, x3, VX2
vfmadd.s VX3, VXBR, x4, VX3
vfadd.s x3, VX0, VX2
vfadd.s x4, VX1, VX3
vilvl.w VX2, x4, x3
vilvh.w VX3, x4, x3
addi.d I, I, -1
vst VX2, Y, 0 * SIZE
vst VX3, Y, 4 * SIZE
addi.d Y, Y, 8 * SIZE
blt $r0, I, .L211
b .L997
.align 3
#endif
.L22:
bge $r0, I, .L997
move YY, Y
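// Both strides are non-unit: re-test alpha and beta against zero and
// dispatch to the specialized strided loops.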
#ifdef DOUBLE
fcmp.ceq.d $fcc0, BETAR, a1
fcmp.ceq.d $fcc1, BETAI, a1
fcmp.ceq.d $fcc2, ALPHAR, a1
fcmp.ceq.d $fcc3, ALPHAI, a1
#else
fcmp.ceq.s $fcc0, BETAR, a1
fcmp.ceq.s $fcc1, BETAI, a1
fcmp.ceq.s $fcc2, ALPHAR, a1
fcmp.ceq.s $fcc3, ALPHAI, a1
#endif
bceqz $fcc0, .L23
bceqz $fcc1, .L23
b .L24
.align 3
.L23:
bceqz $fcc2, .L224
bceqz $fcc3, .L224 //!(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
b .L223 //!(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
.align 3
.L24:
bceqz $fcc2, .L222
bceqz $fcc3, .L222 //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
b .L221 //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
.align 3
.L221: //(beta_r == 0.0 && beta_i == 0.0) and (alpha_r == 0.0 && alpha_i == 0.0)
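// Zero both the real and imaginary parts of four consecutive y elements.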
#ifdef DOUBLE
vstelm.d VXZ, Y, 0 * SIZE, 0
vstelm.d VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.d VXZ, Y, 0 * SIZE, 0
vstelm.d VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.d VXZ, Y, 0 * SIZE, 0
vstelm.d VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.d VXZ, Y, 0 * SIZE, 0
vstelm.d VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3
#else
vstelm.w VXZ, Y, 0 * SIZE, 0
vstelm.w VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.w VXZ, Y, 0 * SIZE, 0
vstelm.w VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.w VXZ, Y, 0 * SIZE, 0
vstelm.w VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
vstelm.w VXZ, Y, 0 * SIZE, 0
vstelm.w VXZ, Y, 1 * SIZE, 0
add.d Y, Y, INCY
addi.d I, I, -1
blt $r0, I, .L221
b .L997
.align 3
#endif
.L222: //(beta_r == 0.0 && beta_i == 0.0) and !(alpha_r == 0.0 && alpha_i == 0.0)
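// Gather x with scalar loads, compute alpha * x, scatter the result to y.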
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
vfmul.d x3, VXAI, x2
vfmul.d x4, VXAI, x1
vfmsub.d x3, VXAR, x1, x3
vfmadd.d x4, VXAR, x2, x4
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d X, X, INCX
vfmul.d x3, VXAI, x2
vfmul.d x4, VXAI, x1
vfmsub.d x3, VXAR, x1, x3
vfmadd.d x4, VXAR, x2, x4
addi.d I, I, -1
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d X, X, INCX
vfmul.s x3, VXAI, x2
vfmul.s x4, VXAI, x1
vfmsub.s x3, VXAR, x1, x3
vfmadd.s x4, VXAR, x2, x4
addi.d I, I, -1
vstelm.w x3, YY, 0 * SIZE, 0
vstelm.w x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 1
vstelm.w x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 2
vstelm.w x4, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 3
vstelm.w x4, YY, 1 * SIZE, 3
add.d YY, YY, INCY
blt $r0, I, .L222
b .L997
.align 3
#endif
.L223:
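// alpha == 0: y := beta * y. Gather y with scalar loads, scale, scatter back.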
#ifdef DOUBLE
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d Y, Y, INCY
vfmul.d x3, VXBI, x2
vfmul.d x4, VXBI, x1
vfmsub.d x3, VXBR, x1, x3
vfmadd.d x4, VXBR, x2, x4
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
add.d Y, Y, INCY
vfmul.d x3, VXBI, x2
vfmul.d x4, VXBI, x1
vfmsub.d x3, VXBR, x1, x3
vfmadd.d x4, VXBR, x2, x4
addi.d I, I, -1
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
#else
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
add.d Y, Y, INCY
vfmul.s x3, VXBI, x2
vfmul.s x4, VXBI, x1
vfmsub.s x3, VXBR, x1, x3
vfmadd.s x4, VXBR, x2, x4
addi.d I, I, -1
vstelm.w x3, YY, 0 * SIZE, 0
vstelm.w x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 1
vstelm.w x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 2
vstelm.w x4, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 3
vstelm.w x4, YY, 1 * SIZE, 3
add.d YY, YY, INCY
blt $r0, I, .L223
b .L997
.align 3
#endif
.L224:
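// General strided case: gather both x and y, compute alpha*x + beta*y,
// scatter the result back to y.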
#ifdef DOUBLE
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
ld.d t1, X, 0 * SIZE
ld.d t2, X, 1 * SIZE
add.d X, X, INCX
ld.d t3, X, 0 * SIZE
ld.d t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.d x1, t1, 0
vinsgr2vr.d x2, t2, 0
vinsgr2vr.d x1, t3, 1
vinsgr2vr.d x2, t4, 1
ld.d t1, Y, 0 * SIZE
ld.d t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.d t3, Y, 0 * SIZE
ld.d t4, Y, 1 * SIZE
vinsgr2vr.d x3, t1, 0
vinsgr2vr.d x4, t2, 0
vinsgr2vr.d x3, t3, 1
vinsgr2vr.d x4, t4, 1
add.d Y, Y, INCY
vfmul.d VX0, VXAI, x2
vfmul.d VX1, VXAI, x1
vfmul.d VX2, VXBI, x4
vfmul.d VX3, VXBI, x3
vfmsub.d VX0, VXAR, x1, VX0
vfmadd.d VX1, VXAR, x2, VX1
vfmsub.d VX2, VXBR, x3, VX2
vfmadd.d VX3, VXBR, x4, VX3
vfadd.d x3, VX0, VX2
vfadd.d x4, VX1, VX3
vstelm.d x3, YY, 0 * SIZE, 0
vstelm.d x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.d x3, YY, 0 * SIZE, 1
vstelm.d x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
addi.d I, I, -1
blt $r0, I, .L224
b .L997
.align 3
#else
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 0
vinsgr2vr.w x2, t2, 0
vinsgr2vr.w x1, t3, 1
vinsgr2vr.w x2, t4, 1
ld.w t1, X, 0 * SIZE
ld.w t2, X, 1 * SIZE
add.d X, X, INCX
ld.w t3, X, 0 * SIZE
ld.w t4, X, 1 * SIZE
add.d X, X, INCX
vinsgr2vr.w x1, t1, 2
vinsgr2vr.w x2, t2, 2
vinsgr2vr.w x1, t3, 3
vinsgr2vr.w x2, t4, 3
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
add.d Y, Y, INCY
vinsgr2vr.w x3, t1, 0
vinsgr2vr.w x4, t2, 0
vinsgr2vr.w x3, t3, 1
vinsgr2vr.w x4, t4, 1
ld.w t1, Y, 0 * SIZE
ld.w t2, Y, 1 * SIZE
add.d Y, Y, INCY
ld.w t3, Y, 0 * SIZE
ld.w t4, Y, 1 * SIZE
vinsgr2vr.w x3, t1, 2
vinsgr2vr.w x4, t2, 2
vinsgr2vr.w x3, t3, 3
vinsgr2vr.w x4, t4, 3
add.d Y, Y, INCY
vfmul.s VX0, VXAI, x2
vfmul.s VX1, VXAI, x1
vfmul.s VX2, VXBI, x4
vfmul.s VX3, VXBI, x3
vfmsub.s VX0, VXAR, x1, VX0
vfmadd.s VX1, VXAR, x2, VX1
vfmsub.s VX2, VXBR, x3, VX2
vfmadd.s VX3, VXBR, x4, VX3
vfadd.s x3, VX0, VX2
vfadd.s x4, VX1, VX3
addi.d I, I, -1
vstelm.w x3, YY, 0 * SIZE, 0
vstelm.w x4, YY, 1 * SIZE, 0
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 1
vstelm.w x4, YY, 1 * SIZE, 1
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 2
vstelm.w x4, YY, 1 * SIZE, 2
add.d YY, YY, INCY
vstelm.w x3, YY, 0 * SIZE, 3
vstelm.w x4, YY, 1 * SIZE, 3
add.d YY, YY, INCY
blt $r0, I, .L224
b .L997
.align 3
#endif
.L997:
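// Scalar tail: process the remaining N % 4 elements one at a time.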
andi I, N, 3
bge $r0, I, .L999
.align 3
.L998:
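// One complex axpby per iteration using scalar FMAs:
// Re(y) = a_r*x_r - a_i*x_i + b_r*y_r - b_i*y_i
// Im(y) = a_r*x_i + a_i*x_r + b_r*y_i + b_i*y_r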
#ifdef DOUBLE
fld.d a1, X, 0 * SIZE
fld.d a2, X, 1 * SIZE
fld.d a3, Y, 0 * SIZE
fld.d a4, Y, 1 * SIZE
addi.d I, I, -1
fmul.d s1, ALPHAI, a2
fmul.d s2, ALPHAI, a1
fmul.d s3, BETAI, a4
fmul.d s4, BETAI, a3
fmsub.d s1, ALPHAR, a1, s1
fmadd.d s2, a2, ALPHAR, s2
fmsub.d s3, BETAR, a3, s3
fmadd.d s4, a4, BETAR, s4
fadd.d s3, s3, s1
fadd.d s4, s4, s2
fst.d s3, Y, 0 * SIZE
fst.d s4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
#else
fld.s a1, X, 0 * SIZE
fld.s a2, X, 1 * SIZE
fld.s a3, Y, 0 * SIZE
fld.s a4, Y, 1 * SIZE
addi.d I, I, -1
fmul.s s1, ALPHAI, a2
fmul.s s2, ALPHAI, a1
fmul.s s3, BETAI, a4
fmul.s s4, BETAI, a3
fmsub.s s1, ALPHAR, a1, s1
fmadd.s s2, a2, ALPHAR, s2
fmsub.s s3, BETAR, a3, s3
fmadd.s s4, a4, BETAR, s4
fadd.s s3, s3, s1
fadd.s s4, s4, s2
fst.s s3, Y, 0 * SIZE
fst.s s4, Y, 1 * SIZE
add.d X, X, INCX
add.d Y, Y, INCY
blt $r0, I, .L998
.align 3
#endif
.L999:
move $r4, $r12
jirl $r0, $r1, 0x0
.align 3
EPILOGUE