Fixed #7. When incx==0 or incy==0 in the axpy functions: 1) disable multi-threading, and 2) skip the unrolled loops in the kernel code.

This commit is contained in:
Xianyi Zhang 2011-02-21 00:24:21 +08:00
parent 109b86d00e
commit 0cfd29a819
6 changed files with 72 additions and 0 deletions

View File

@ -81,6 +81,11 @@ void CNAME(blasint n, FLOAT alpha, FLOAT *x, blasint incx, FLOAT *y, blasint inc
#ifdef SMP
nthreads = num_cpu_avail(1);
// Disable multi-threading when incx == 0 or incy == 0.
// In that case the work done by the threads would not be independent.
if (incx == 0 || incy == 0)
nthreads = 1;
if (nthreads == 1) {
#endif

View File

@ -83,6 +83,11 @@ void CNAME(blasint n, FLOAT *ALPHA, FLOAT *x, blasint incx, FLOAT *y, blasint in
#ifdef SMP
nthreads = num_cpu_avail(1);
// Disable multi-threading when incx == 0 or incy == 0.
// In that case the work done by the threads would not be independent.
if (incx == 0 || incy == 0)
nthreads = 1;
if (nthreads == 1) {
#endif

View File

@ -1463,6 +1463,12 @@
.L50:
movq M, %rax
movq Y, YY
// If incx == 0 or incy == 0, skip the unrolled loop and go straight to .L56.
cmpq $0, INCX
je .L56
cmpq $0, INCY
je .L56
sarq $3, %rax
jle .L55
ALIGN_3

View File

@ -805,6 +805,12 @@
.L40:
movq Y, YY
movq M, %rax
// If incx == 0 or incy == 0, skip the unrolled loop and go straight to .L46.
cmpq $0, INCX
je .L46
cmpq $0, INCY
je .L46
sarq $3, %rax
jle .L45
ALIGN_3

View File

@ -2893,6 +2893,12 @@
unpcklps %xmm13, %xmm15
#endif
// If incx == 0 or incy == 0, skip the unrolled loops and jump to the one-element-per-iteration loop at .L200 near the end.
cmpq $0, INCX
je .L200
cmpq $0, INCY
je .L200
movq Y, YY
movq M, %rax
@ -3105,8 +3111,42 @@
addps %xmm1, %xmm8
movsd %xmm8, (Y)
jmp .L999
ALIGN_3
// .L200: fallback path that processes one 64-bit element of X and Y per
// iteration, with no unrolling. It is the branch target of the
// INCX == 0 / INCY == 0 checks earlier in this routine.
// NOTE(review): the two packed floats are presumably the real and imaginary
// parts of one single-precision complex number - confirm against the kernel
// header.
.L200:
// %rax = loop counter, initialised from M; nothing to do when M <= 0.
movq M, %rax
cmpq $0, %rax
jle .L999
ALIGN_3
.L201:
// Load two packed floats (64 bits) from X into the low half of %xmm0,
// then step X by INCX. With INCX == 0 the same element is reloaded each time.
movsd (X), %xmm0
addq INCX, X
// Split the loaded pair: %xmm1 gets the odd-indexed float duplicated,
// %xmm0 gets the even-indexed float duplicated. Both branches of the #ifdef
// produce the same lane layout; the #else branch is the form used when
// HAVE_SSE3 is not defined.
#ifdef HAVE_SSE3
movshdup %xmm0, %xmm1
movsldup %xmm0, %xmm0
#else
pshufd $0xf5, %xmm0, %xmm1
shufps $0xa0, %xmm0, %xmm0
#endif
// Scale by the multipliers held in %xmm14 / %xmm15.
// NOTE(review): these are presumably the alpha-derived vectors prepared
// before the INCX/INCY checks - the setup is outside this view, confirm.
mulps %xmm14, %xmm0
mulps %xmm15, %xmm1
// Read-modify-write the 64-bit element at Y: add both products, store the low
// 64 bits back. With INCY == 0 every iteration accumulates into the same
// element, so the iterations must run in order.
movsd (Y), %xmm8
addps %xmm0, %xmm8
addps %xmm1, %xmm8
movsd %xmm8, (Y)
addq INCY, Y
// Count down; keep looping while the remaining count is still positive.
decq %rax
jg .L201
ALIGN_3
.L999:
xorq %rax, %rax

View File

@ -1416,6 +1416,12 @@
movq Y, YY
movq M, %rax
// If incx == 0 or incy == 0, skip the unrolled loop and jump to the single-element code at .L58 near the end.
cmpq $0, INCX
je .L58
cmpq $0, INCY
je .L58
sarq $3, %rax
jle .L55
@ -1769,6 +1775,7 @@
andq $1, %rax
jle .L999
.L58:
MOVDDUP( 0 * SIZE, X, %xmm0)
MOVDDUP( 1 * SIZE, X, %xmm1)
@ -1781,6 +1788,9 @@
movlpd %xmm8, 0 * SIZE(YY)
movhpd %xmm8, 1 * SIZE(YY)
decq %rax
jg .L58
ALIGN_3
.L999: