Fixed #7. Modified the axpy kernel code to avoid the unrolled loop when incx==0 or incy==0 on 32-bit x86.

This commit is contained in:
Xianyi 2011-02-23 20:08:34 +08:00
parent cd2cbabecc
commit 12214e1d0f
4 changed files with 61 additions and 0 deletions

View File

@ -1440,6 +1440,12 @@
.L50:
movl M, %eax
movl Y, YY
//If incx==0 || incy==0, skip the unrolled loop (it assumes nonzero strides).
cmpl $0, INCX
je .L56
cmpl $0, INCY
je .L56
sarl $3, %eax
jle .L55
ALIGN_3

View File

@ -698,6 +698,12 @@
.L40:
movl Y, YY
movl M, %eax
//If incx==0 || incy==0, skip the unrolled loop (it assumes nonzero strides).
cmpl $0, INCX
je .L46
cmpl $0, INCY
je .L46
sarl $3, %eax
jle .L45
ALIGN_3

View File

@ -2857,6 +2857,11 @@
unpcklps ALPHA_I, ALPHA_R
unpcklps %xmm5, ALPHA_I
#endif
//If incx==0 || incy==0, skip the unrolled loop and jump to the one-element-at-a-time tail loop.
cmpl $0, INCX
je .L200
cmpl $0, INCY
je .L200
movl Y, YY
@ -3090,8 +3095,41 @@
addps %xmm1, %xmm4
movsd %xmm4, (Y)
jmp .L999
ALIGN_3
.L200:
movl M, %eax
cmpl $0, %eax
jle .L999
ALIGN_3
.L201:
movsd (X), %xmm0
#ifdef HAVE_SSE3
movshdup %xmm0, %xmm1
movsldup %xmm0, %xmm0
#else
movaps %xmm0, %xmm1
shufps $0xa0, %xmm0, %xmm0
shufps $0xf5, %xmm1, %xmm1
#endif
mulps ALPHA_R, %xmm0
mulps ALPHA_I, %xmm1
movsd (Y), %xmm4
addps %xmm0, %xmm4
addps %xmm1, %xmm4
movsd %xmm4, (Y)
decl %eax
jg .L201
ALIGN_3
.L999:
popl %ebp
popl %ebx

View File

@ -1318,6 +1318,12 @@
movl Y, YY
movl M, %eax
//If incx==0 || incy==0, skip the unrolled loop and jump to the one-element-at-a-time tail loop.
cmpl $0, INCX
je .L58
cmpl $0, INCY
je .L58
sarl $2, %eax
jle .L55
@ -1498,6 +1504,7 @@
andl $1, %eax
jle .L999
.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)
@ -1510,6 +1517,10 @@
movlpd %xmm4, 0 * SIZE(YY)
movhpd %xmm4, 1 * SIZE(YY)
decl %eax
jg .L58
ALIGN_3
.L999: