diff --git a/interface/axpy.c b/interface/axpy.c index 03b981985..dd75b758c 100644 --- a/interface/axpy.c +++ b/interface/axpy.c @@ -81,6 +81,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/interface/zaxpy.c b/interface/zaxpy.c index d3355ea57..9ed72efb9 100644 --- a/interface/zaxpy.c +++ b/interface/zaxpy.c @@ -83,6 +83,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in #ifdef SMP nthreads = num_cpu_avail(1); + //disable multi-thread when incx==0 or incy==0 + //In that case, the threads would be dependent. + if (incx == 0 || incy == 0) + nthreads = 1; + if (nthreads == 1) { #endif diff --git a/kernel/x86_64/axpy_sse.S b/kernel/x86_64/axpy_sse.S index 23c2ec54e..9a7512575 100644 --- a/kernel/x86_64/axpy_sse.S +++ b/kernel/x86_64/axpy_sse.S @@ -1463,6 +1463,12 @@ .L50: movq M, %rax movq Y, YY +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L56 + cmpq $0, INCY + je .L56 + sarq $3, %rax jle .L55 ALIGN_3 diff --git a/kernel/x86_64/axpy_sse2.S b/kernel/x86_64/axpy_sse2.S index 554602917..dea8d0382 100644 --- a/kernel/x86_64/axpy_sse2.S +++ b/kernel/x86_64/axpy_sse2.S @@ -805,6 +805,12 @@ .L40: movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop. + cmpq $0, INCX + je .L46 + cmpq $0, INCY + je .L46 + sarq $3, %rax jle .L45 ALIGN_3 diff --git a/kernel/x86_64/zaxpy_sse.S b/kernel/x86_64/zaxpy_sse.S index 69cdedaaa..42b920cfb 100644 --- a/kernel/x86_64/zaxpy_sse.S +++ b/kernel/x86_64/zaxpy_sse.S @@ -2893,6 +2893,12 @@ unpcklps %xmm13, %xmm15 #endif +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpq $0, INCX + je .L200 + cmpq $0, INCY + je .L200 + movq Y, YY movq M, %rax @@ -3105,8 +3111,42 @@ addps %xmm1, %xmm8 movsd %xmm8, (Y) + jmp .L999 ALIGN_3 + +.L200: + movq M, %rax + cmpq $0, %rax + jle .L999 + ALIGN_3 +.L201: + movsd (X), %xmm0 + addq INCX, X + +#ifdef HAVE_SSE3 + movshdup %xmm0, %xmm1 + movsldup %xmm0, %xmm0 +#else + pshufd $0xf5, %xmm0, %xmm1 + shufps $0xa0, %xmm0, %xmm0 +#endif + + mulps %xmm14, %xmm0 + mulps %xmm15, %xmm1 + + movsd (Y), %xmm8 + + addps %xmm0, %xmm8 + addps %xmm1, %xmm8 + + movsd %xmm8, (Y) + addq INCY, Y + + decq %rax + jg .L201 + ALIGN_3 + .L999: xorq %rax, %rax diff --git a/kernel/x86_64/zaxpy_sse2.S b/kernel/x86_64/zaxpy_sse2.S index f1616e362..1b7e3a563 100644 --- a/kernel/x86_64/zaxpy_sse2.S +++ b/kernel/x86_64/zaxpy_sse2.S @@ -1416,6 +1416,12 @@ movq Y, YY movq M, %rax +//If incx==0 || incy==0, avoid unloop and jump to end. + cmpq $0, INCX + je .L58 + cmpq $0, INCY + je .L58 + sarq $3, %rax jle .L55 @@ -1769,6 +1775,7 @@ andq $1, %rax jle .L999 +.L58: MOVDDUP( 0 * SIZE, X, %xmm0) MOVDDUP( 1 * SIZE, X, %xmm1) @@ -1781,6 +1788,9 @@ movlpd %xmm8, 0 * SIZE(YY) movhpd %xmm8, 1 * SIZE(YY) + + decq %rax + jg .L58 ALIGN_3 .L999: