Fixed #7. Modified the axpy kernels to skip the unrolled loops when incx==0 or incy==0 on the 32-bit x86 architecture.
This commit is contained in:
parent
cd2cbabecc
commit
12214e1d0f
|
@ -1440,6 +1440,12 @@
|
|||
.L50:
|
||||
movl M, %eax
|
||||
movl Y, YY
|
||||
// If incx==0 || incy==0, skip the unrolled loop.
|
||||
cmpl $0, INCX
|
||||
je .L56
|
||||
cmpl $0, INCY
|
||||
je .L56
|
||||
|
||||
sarl $3, %eax
|
||||
jle .L55
|
||||
ALIGN_3
|
||||
|
|
|
@ -698,6 +698,12 @@
|
|||
.L40:
|
||||
movl Y, YY
|
||||
movl M, %eax
|
||||
// If incx==0 || incy==0, skip the unrolled loop.
|
||||
cmpl $0, INCX
|
||||
je .L46
|
||||
cmpl $0, INCY
|
||||
je .L46
|
||||
|
||||
sarl $3, %eax
|
||||
jle .L45
|
||||
ALIGN_3
|
||||
|
|
|
@ -2857,6 +2857,11 @@
|
|||
unpcklps ALPHA_I, ALPHA_R
|
||||
unpcklps %xmm5, ALPHA_I
|
||||
#endif
|
||||
// If incx==0 || incy==0, skip the unrolled loops and use the per-element loop at .L200.
|
||||
cmpl $0, INCX
|
||||
je .L200
|
||||
cmpl $0, INCY
|
||||
je .L200
|
||||
|
||||
movl Y, YY
|
||||
|
||||
|
@ -3090,8 +3095,41 @@
|
|||
addps %xmm1, %xmm4
|
||||
|
||||
movsd %xmm4, (Y)
|
||||
jmp .L999
|
||||
ALIGN_3
|
||||
|
||||
.L200:
	// Fallback path taken when INCX == 0 or INCY == 0: no unrolling,
	// one 8-byte element of X and of Y is handled per iteration.
	movl	M, %eax			// %eax = number of iterations left
	testl	%eax, %eax
	jle	.L999			// M <= 0: nothing to do
	ALIGN_3

.L201:
	movsd	(X), %xmm0		// load two packed floats from X
	// Step X by its stride; a no-op when INCX == 0. Without this the
	// INCY == 0, INCX != 0 case would re-read element 0 every time.
	// NOTE(review): assumes INCX/INCY were already scaled to byte
	// strides earlier in the routine -- confirm against the prologue.
	addl	INCX, X

	// Split the pair: %xmm0 = even lane duplicated, %xmm1 = odd lane
	// duplicated (presumably the real and imaginary parts).
#ifdef HAVE_SSE3
	movshdup %xmm0, %xmm1
	movsldup %xmm0, %xmm0
#else
	movaps	%xmm0, %xmm1
	shufps	$0xa0, %xmm0, %xmm0
	shufps	$0xf5, %xmm1, %xmm1
#endif

	mulps	ALPHA_R, %xmm0
	mulps	ALPHA_I, %xmm1

	movsd	(Y), %xmm4		// load the current Y element

	addps	%xmm0, %xmm4		// accumulate both partial products
	addps	%xmm1, %xmm4

	movsd	%xmm4, (Y)		// store the updated Y element
	// Step Y by its stride; a no-op when INCY == 0. Without this the
	// INCX == 0, INCY != 0 case would pile all M updates onto Y[0].
	addl	INCY, Y

	decl	%eax
	jg	.L201
|
||||
|
||||
ALIGN_3
|
||||
.L999:
|
||||
popl %ebp
|
||||
popl %ebx
|
||||
|
|
|
@ -1318,6 +1318,12 @@
|
|||
|
||||
movl Y, YY
|
||||
movl M, %eax
|
||||
// If incx==0 || incy==0, skip the unrolled loop and process every element in the .L58 loop.
|
||||
cmpl $0, INCX
|
||||
je .L58
|
||||
cmpl $0, INCY
|
||||
je .L58
|
||||
|
||||
sarl $2, %eax
|
||||
jle .L55
|
||||
|
||||
|
@ -1498,6 +1504,7 @@
|
|||
andl $1, %eax
|
||||
jle .L999
|
||||
|
||||
.L58:
|
||||
MOVDDUP( 0 * SIZE, X, %xmm0)
|
||||
MOVDDUP( 1 * SIZE, X, %xmm1)
|
||||
|
||||
|
@ -1510,6 +1517,10 @@
|
|||
|
||||
movlpd %xmm4, 0 * SIZE(YY)
|
||||
movhpd %xmm4, 1 * SIZE(YY)
|
||||
|
||||
|
||||
decl %eax
|
||||
jg .L58
|
||||
ALIGN_3
|
||||
|
||||
.L999:
|
||||
|
|
Loading…
Reference in New Issue