loongarch: Fixed i{c/z}amin LSX opt
commit ac460eb42a
parent 56d114b245
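Note on the change (the scalar sketch below is illustrative only, not part of the kernel): the hunks drop the old "vreplvei.w VM0, VM0, 0" seeding and add an explicit "Init VM0" block on both the contiguous and the strided (.L20) paths. The first vector's worth of complex elements is loaded, |Re| and |Im| are formed with the VI4 = -1 multiply plus vfcmp.clt/vbitsel select, and their sums seed the running minimum VM0; the "srai.d I, N, 2" / "bge $r0, I, .L21" length check now sits after the index setup, so the seeding only runs when the vector loop (N >= 4) will execute. In C, the seeded quantity per element is the usual i{c/z}amin measure |Re| + |Im|; the helper name and parameters here are hypothetical:

#include <math.h>
#include <stddef.h>

/* Hypothetical helper, for illustration only (not from the commit).
 * Models what the new "Init VM0" block seeds: |Re| + |Im| of the first
 * n_seed complex elements (2 for DOUBLE, 4 for single precision).
 * The LSX code builds |x| as (x < 0 ? -1.0 * x : x) using VI4 = -1,
 * vfcmp.clt against VI3 = 0 and vbitsel; fabsf() is the scalar version. */
static float init_vm0_sketch(const float *x, size_t inc_x, int n_seed)
{
    float m = fabsf(x[0]) + fabsf(x[1]);            /* first complex element */
    for (int i = 1; i < n_seed; i++) {
        const float *p = x + 2 * (size_t)i * inc_x; /* i-th complex element  */
        float s = fabsf(p[0]) + fabsf(p[1]);        /* |Re| + |Im|           */
        if (s < m)
            m = s;                                  /* keep the smallest sum */
    }
    return m;
}

In the kernel itself VM0 holds these sums one per lane and is folded into the loop results later with vfmina/vbitsel; the eager min in the sketch is only to make the seed value concrete.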
@@ -70,18 +70,17 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     LD a1, X, 1 * SIZE
     FABS a0, a0
     FABS a1, a1
-    ADD s1, a1, a0
-    vreplvei.w VM0, VM0, 0
+    ADD s1, a1, a0 // Initialization value
     vxor.v VI3, VI3, VI3 // 0
 #ifdef DOUBLE
     li.d I, -1
     vreplgr2vr.d VI4, I
     vffint.d.l VI4, VI4 // -1
-    bne INCX, TEMP, .L20
+    bne INCX, TEMP, .L20 // incx != 1
+
+    // Init Index
     addi.d i0, i0, 1
-    srai.d I, N, 2
-    bge $r0, I, .L21
-    slli.d i0, i0, 1 //2
+    slli.d i0, i0, 1 // 2
     vreplgr2vr.d VINC4, i0
     addi.d i0, i0, -3
     vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
@@ -91,14 +90,30 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.d VI0, i0, 0 //1
     addi.d i0, i0, 1
     vinsgr2vr.d VI0, i0, 1 //2
+
+    srai.d I, N, 2
+    bge $r0, I, .L21
+
+    // Init VM0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 2 * SIZE
+    vpickev.d x1, VX1, VX0
+    vpickod.d x2, VX1, VX0
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d VM0, x1, x2
 #else
     li.w I, -1
     vreplgr2vr.w VI4, I
     vffint.s.w VI4, VI4 // -1
-    bne INCX, TEMP, .L20
+    bne INCX, TEMP, .L20 // incx != 1
+
+    // Init Index
     addi.w i0, i0, 1
-    srai.d I, N, 2
-    bge $r0, I, .L21
     slli.w i0, i0, 2 //4
     vreplgr2vr.w VINC4, i0
     addi.w i0, i0, -7
@@ -117,6 +132,22 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VI0, i0, 2 //3
     addi.w i0, i0, 1
     vinsgr2vr.w VI0, i0, 3 //4
+
+    srai.d I, N, 2
+    bge $r0, I, .L21
+
+    // Init VM0
+    vld VX0, X, 0 * SIZE
+    vld VX1, X, 4 * SIZE
+    vpickev.w x1, VX1, VX0
+    vpickod.w x2, VX1, VX0
+    vfmul.s x3, VI4, x1
+    vfmul.s x4, VI4, x2
+    vfcmp.clt.s VT0, x1, VI3
+    vfcmp.clt.s VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.s VM0, x1, x2
 #endif
     .align 3

@@ -139,6 +170,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vfcmp.ceq.d VT0, x3, VM0
     vbitsel.v VM0, x3, VM0, VT0
     vbitsel.v VI0, VI1, VI0, VT0
+
     vld VX0, X, 4 * SIZE
     vadd.d VI1, VI1, VINC4
     vld VX1, X, 6 * SIZE
@@ -206,9 +238,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 .L20: // INCX!=1
 #ifdef DOUBLE
     addi.d i0, i0, 1
-    srai.d I, N, 2
-    bge $r0, I, .L21
-    slli.d i0, i0, 1 //2
+    // Init index
+    slli.d i0, i0, 1 //2
     vreplgr2vr.d VINC4, i0
     addi.d i0, i0, -3
     vinsgr2vr.d VI1, i0, 0 //initialize the index value for vectorization
@@ -218,10 +249,32 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.d VI0, i0, 0 //1
     addi.d i0, i0, 1
     vinsgr2vr.d VI0, i0, 1 //2
+
+    srai.d I, N, 2
+    bge $r0, I, .L21 // N < 4
+
+    // Init VM0
+    ld.d t1, X, 0 * SIZE
+    ld.d t2, X, 1 * SIZE
+    add.d i1, X, INCX
+    ld.d t3, i1, 0 * SIZE
+    ld.d t4, i1, 1 * SIZE
+    add.d i1, i1, INCX
+    vinsgr2vr.d x1, t1, 0
+    vinsgr2vr.d x2, t2, 0
+    vinsgr2vr.d x1, t3, 1
+    vinsgr2vr.d x2, t4, 1
+    vfmul.d x3, VI4, x1
+    vfmul.d x4, VI4, x2
+    vfcmp.clt.d VT0, x1, VI3
+    vfcmp.clt.d VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.d VM0, x1, x2
 #else
     addi.w i0, i0, 1
-    srai.d I, N, 2
-    bge $r0, I, .L21
+    // Init index
     slli.w i0, i0, 2 //4
     vreplgr2vr.w VINC4, i0
     addi.w i0, i0, -7
@@ -240,6 +293,36 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w VI0, i0, 2 //3
     addi.w i0, i0, 1
     vinsgr2vr.w VI0, i0, 3 //4
+
+    srai.d I, N, 2
+    bge $r0, I, .L21 // N < 4
+
+    // Init VM0
+    ld.w t1, X, 0 * SIZE
+    ld.w t2, X, 1 * SIZE
+    add.d i1, X, INCX
+    ld.w t3, i1, 0 * SIZE
+    ld.w t4, i1, 1 * SIZE
+    add.d i1, i1, INCX
+    vinsgr2vr.w x1, t1, 0
+    vinsgr2vr.w x2, t2, 0
+    vinsgr2vr.w x1, t3, 1
+    vinsgr2vr.w x2, t4, 1
+    ld.w t1, i1, 0 * SIZE
+    ld.w t2, i1, 1 * SIZE
+    add.d i1, i1, INCX
+    ld.w t3, i1, 0 * SIZE
+    ld.w t4, i1, 1 * SIZE
+    add.d i1, i1, INCX
+    vinsgr2vr.w x1, t1, 2
+    vinsgr2vr.w x2, t2, 2
+    vinsgr2vr.w x1, t3, 3
+    vinsgr2vr.w x2, t4, 3
+    vfcmp.clt.s VT0, x1, VI3
+    vfcmp.clt.s VINC8, x2, VI3
+    vbitsel.v x1, x1, x3, VT0
+    vbitsel.v x2, x2, x4, VINC8
+    vfadd.s VM0, x1, x2
 #endif
     .align 3

@@ -300,8 +383,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
     vinsgr2vr.w x2, t2, 2
     vinsgr2vr.w x1, t3, 3
     vinsgr2vr.w x2, t4, 3
-    vpickev.w x1, VX1, VX0
-    vpickod.w x2, VX1, VX0
 #endif
     addi.d I, I, -1
     VFMUL x3, VI4, x1
@@ -358,12 +439,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #ifdef DOUBLE
     vfmina.d VM0, x1, x2
     vfcmp.ceq.d VT0, x1, VM0
+    vbitsel.v VI0, VI2, VI1, VT0
 #else
     fcmp.ceq.d $fcc0, $f15, $f10
     bceqz $fcc0, .L27
     vfcmp.clt.s VT0, VI2, VI0
-#endif
     vbitsel.v VI0, VI0, VI2, VT0
+#endif
     .align 3

 .L27: