diff --git a/kernel/x86/axpy_sse.S b/kernel/x86/axpy_sse.S index 291a219ce..e06d90184 100644 --- a/kernel/x86/axpy_sse.S +++ b/kernel/x86/axpy_sse.S @@ -1440,6 +1440,12 @@ .L50: movl M, %eax movl Y, YY +//If incx==0 || incy==0, avoid the unrolled loop. + cmpl $0, INCX + je .L56 + cmpl $0, INCY + je .L56 + sarl $3, %eax jle .L55 ALIGN_3 diff --git a/kernel/x86/axpy_sse2.S b/kernel/x86/axpy_sse2.S index 5e31d3dba..9b2d5d808 100644 --- a/kernel/x86/axpy_sse2.S +++ b/kernel/x86/axpy_sse2.S @@ -698,6 +698,12 @@ .L40: movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid the unrolled loop. + cmpl $0, INCX + je .L46 + cmpl $0, INCY + je .L46 + sarl $3, %eax jle .L45 ALIGN_3 diff --git a/kernel/x86/zaxpy_sse.S b/kernel/x86/zaxpy_sse.S index edd9929cd..9c94cec44 100644 --- a/kernel/x86/zaxpy_sse.S +++ b/kernel/x86/zaxpy_sse.S @@ -2857,6 +2857,11 @@ unpcklps ALPHA_I, ALPHA_R unpcklps %xmm5, ALPHA_I #endif +//If incx==0 || incy==0, avoid the unrolled loop and jump to the end. + cmpl $0, INCX + je .L200 + cmpl $0, INCY + je .L200 movl Y, YY @@ -3090,8 +3095,41 @@ addps %xmm1, %xmm4 movsd %xmm4, (Y) + jmp .L999 ALIGN_3 +.L200: + movl M, %eax + cmpl $0, %eax + jle .L999 + ALIGN_3 + +.L201: + movsd (X), %xmm0 + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + movaps %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 + shufps $0xf5, %xmm1, %xmm1 +#endif + + mulps ALPHA_R, %xmm0 + mulps ALPHA_I, %xmm1 + + movsd (Y), %xmm4 + + addps %xmm0, %xmm4 + addps %xmm1, %xmm4 + + movsd %xmm4, (Y) + + decl %eax + jg .L201 + + ALIGN_3 .L999: popl %ebp popl %ebx diff --git a/kernel/x86/zaxpy_sse2.S b/kernel/x86/zaxpy_sse2.S index 40afdc3fc..9c2caa7e8 100644 --- a/kernel/x86/zaxpy_sse2.S +++ b/kernel/x86/zaxpy_sse2.S @@ -1318,6 +1318,12 @@ movl Y, YY movl M, %eax +//If incx==0 || incy==0, avoid the unrolled loop and jump to the end.
+ cmpl $0, INCX + je .L58 + cmpl $0, INCY + je .L58 + sarl $2, %eax jle .L55 @@ -1498,6 +1504,7 @@ andl $1, %eax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1510,6 +1517,10 @@ movlpd %xmm4, 0 * SIZE(YY) movhpd %xmm4, 1 * SIZE(YY) + + + decl %eax + jg .L58 ALIGN_3 .L999: