Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop

This commit is contained in:
Zhang Xianyi 2014-02-18 15:53:57 +08:00
commit dd2d3e61ab
7 changed files with 144 additions and 107 deletions

View File

@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ATTRIBUTE_SIZE 128 #define ATTRIBUTE_SIZE 128
extern void openblas_warning(int verbose, const char * msg);
/* This is a thread server model implementation. The threads are */ /* This is a thread server model implementation. The threads are */
/* spawned at first access to blas library, and still remains until */ /* spawned at first access to blas library, and still remains until */
/* destruction routine is called. The number of threads are */ /* destruction routine is called. The number of threads are */
@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0; return 0;
} }
/*
  https://github.com/xianyi/OpenBLAS/issues/294
  Use pthread_atfork to shut down blas_thread_server before fork(),
  then re-initialize it after fork() in both the parent and the child.
*/

/* pthread_atfork() takes handlers of type void (*)(void), whereas
   blas_thread_shutdown returns int.  These adapters give every handler
   exactly the expected type and drop the status value, which an atfork
   handler has no way to report anyway. */
static void openblas_atfork_prepare(void)
{
  (void)BLASFUNC(blas_thread_shutdown)();
}

static void openblas_atfork_resume(void)
{
  (void)blas_thread_init();
}

/* Registers the fork handlers for the thread server: shutdown runs before
   fork(), re-initialization runs afterwards in the parent and in the child.
   If registration fails, only a warning is emitted and execution goes on. */
void openblas_fork_handler(void)
{
  int err;

  err = pthread_atfork(openblas_atfork_prepare,
                       openblas_atfork_resume,
                       openblas_atfork_resume);
  if (err != 0) {
    openblas_warning(0, "OpenBLAS cannot install fork handler. You may meet hang after fork.\n");
  }
}
#endif #endif

View File

@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
return 0; return 0;
} }
/* Intentionally a no-op: this thread-server variant installs no fork
   handler.  The symbol is kept so that code calling openblas_fork_handler()
   still links against this variant. */
void openblas_fork_handler() { /* nothing to register */ }
#endif #endif

View File

@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
{ {
goto_set_num_threads(num); goto_set_num_threads(num);
} }
/* Intentionally a no-op: this thread-server variant installs no fork
   handler.  The symbol is kept so that code calling openblas_fork_handler()
   still links against this variant. */
void openblas_fork_handler() { /* nothing to register */ }

View File

@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
#ifdef SMP #ifdef SMP
if (blas_cpu_number == 0) blas_get_cpu_number(); if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER #ifdef SMP_SERVER
if (blas_server_avail == 0) blas_thread_init(); if (blas_server_avail == 0) {
blas_thread_init();
//deal with pthread and fork.
openblas_fork_handler();
}
#endif #endif
#endif #endif

View File

@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = scopy_vfp.S SCOPYKERNEL = copy.c
DCOPYKERNEL = dcopy_vfp.S DCOPYKERNEL = copy.c
CCOPYKERNEL = ccopy_vfp.S CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy_vfp.S ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S DDOTKERNEL = ddot_vfp.S

View File

@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = scopy_vfp.S SCOPYKERNEL = copy.c
DCOPYKERNEL = dcopy_vfp.S DCOPYKERNEL = copy.c
CCOPYKERNEL = ccopy_vfp.S CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy_vfp.S ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S DDOTKERNEL = ddot_vfp.S
@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal_vfp.S SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S DSCALKERNEL = scal.c
CSCALKERNEL = scal_vfp.S CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S ZSCALKERNEL = scal_vfp.S
SGEMVNKERNEL = gemv_n_vfp.S SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S DGEMVNKERNEL = gemv_n.c
CGEMVNKERNEL = cgemv_n_vfp.S CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S ZGEMVNKERNEL = zgemv_n_vfp.S

View File

@ -111,6 +111,9 @@
#define MM M #define MM M
#endif #endif
#define TMP_M %r15
#define Y2 %rbx
PROLOGUE PROLOGUE
PROFCODE PROFCODE
@ -170,8 +173,9 @@
jge .L00t jge .L00t
movq MMM,M movq MMM,M
addq I,M addq M, I
jle .L999x jle .L999x
movq I, M
.L00t: .L00t:
movq XX,X movq XX,X
@ -2463,21 +2467,23 @@
cmpq Y, BUFFER cmpq Y, BUFFER
je .L999 je .L999
#endif #endif
movq M, TMP_M
movq Y, Y1
cmpq $SIZE, INCY cmpq $SIZE, INCY
jne .L950 jne .L950
testq $SIZE, Y testq $SIZE, Y1
je .L910 je .L910
movsd (Y), %xmm0 movsd (Y1), %xmm0
addsd (BUFFER), %xmm0 addsd (BUFFER), %xmm0
movsd %xmm0, (Y) movsd %xmm0, (Y1)
addq $SIZE, Y addq $SIZE, Y1
addq $SIZE, BUFFER addq $SIZE, BUFFER
decq M decq TMP_M
jle .L999 jle .L999
ALIGN_4 ALIGN_4
@ -2485,20 +2491,20 @@
testq $SIZE, BUFFER testq $SIZE, BUFFER
jne .L920 jne .L920
movq M, %rax movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L914 jle .L914
ALIGN_3 ALIGN_3
.L912: .L912:
#ifdef PREFETCHW #ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif #endif
movapd 0 * SIZE(Y), %xmm0 movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y), %xmm2 movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y), %xmm3 movapd 6 * SIZE(Y1), %xmm3
movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
@ -2514,12 +2520,12 @@
addpd %xmm6, %xmm2 addpd %xmm6, %xmm2
addpd %xmm7, %xmm3 addpd %xmm7, %xmm3
movapd %xmm0, 0 * SIZE(Y) movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y) movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y) movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y) movapd %xmm3, 6 * SIZE(Y1)
addq $8 * SIZE, Y addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER addq $8 * SIZE, BUFFER
decq %rax decq %rax
@ -2527,14 +2533,14 @@
ALIGN_3 ALIGN_3
.L914: .L914:
testq $7, M testq $7, TMP_M
jle .L999 jle .L999
testq $4, M testq $4, TMP_M
jle .L915 jle .L915
movapd 0 * SIZE(Y), %xmm0 movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y1), %xmm1
movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
@ -2542,40 +2548,40 @@
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
addpd %xmm5, %xmm1 addpd %xmm5, %xmm1
movapd %xmm0, 0 * SIZE(Y) movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y) movapd %xmm1, 2 * SIZE(Y1)
addq $4 * SIZE, Y addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER addq $4 * SIZE, BUFFER
ALIGN_3 ALIGN_3
.L915: .L915:
testq $2, M testq $2, TMP_M
jle .L916 jle .L916
movapd (Y), %xmm0 movapd (Y1), %xmm0
movapd (BUFFER), %xmm4 movapd (BUFFER), %xmm4
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
movapd %xmm0, (Y) movapd %xmm0, (Y1)
addq $2 * SIZE, Y addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER addq $2 * SIZE, BUFFER
ALIGN_3 ALIGN_3
.L916: .L916:
testq $1, M testq $1, TMP_M
jle .L999 jle .L999
movsd (Y), %xmm0 movsd (Y1), %xmm0
movsd 0 * SIZE(BUFFER), %xmm4 movsd 0 * SIZE(BUFFER), %xmm4
addsd %xmm4, %xmm0 addsd %xmm4, %xmm0
movlpd %xmm0, (Y) movlpd %xmm0, (Y1)
ALIGN_3 ALIGN_3
jmp .L999 jmp .L999
@ -2584,20 +2590,20 @@
.L920: .L920:
movapd -1 * SIZE(BUFFER), %xmm4 movapd -1 * SIZE(BUFFER), %xmm4
movq M, %rax movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L924 jle .L924
ALIGN_3 ALIGN_3
.L922: .L922:
#ifdef PREFETCHW #ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y) PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif #endif
movapd 0 * SIZE(Y), %xmm0 movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y), %xmm2 movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y), %xmm3 movapd 6 * SIZE(Y1), %xmm3
movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6 movapd 3 * SIZE(BUFFER), %xmm6
@ -2618,14 +2624,14 @@
addpd %xmm6, %xmm2 addpd %xmm6, %xmm2
addpd %xmm7, %xmm3 addpd %xmm7, %xmm3
movapd %xmm0, 0 * SIZE(Y) movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y) movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y) movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y) movapd %xmm3, 6 * SIZE(Y1)
movapd %xmm8, %xmm4 movapd %xmm8, %xmm4
addq $8 * SIZE, Y addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER addq $8 * SIZE, BUFFER
decq %rax decq %rax
@ -2633,14 +2639,14 @@
ALIGN_3 ALIGN_3
.L924: .L924:
testq $7, M testq $7, TMP_M
jle .L999 jle .L999
testq $4, M testq $4, TMP_M
jle .L925 jle .L925
movapd 0 * SIZE(Y), %xmm0 movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y), %xmm1 movapd 2 * SIZE(Y1), %xmm1
movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6 movapd 3 * SIZE(BUFFER), %xmm6
@ -2651,20 +2657,20 @@
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
addpd %xmm5, %xmm1 addpd %xmm5, %xmm1
movapd %xmm0, 0 * SIZE(Y) movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y) movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm6, %xmm4 movapd %xmm6, %xmm4
addq $4 * SIZE, Y addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER addq $4 * SIZE, BUFFER
ALIGN_3 ALIGN_3
.L925: .L925:
testq $2, M testq $2, TMP_M
jle .L926 jle .L926
movapd (Y), %xmm0 movapd (Y1), %xmm0
movapd 1 * SIZE(BUFFER), %xmm5 movapd 1 * SIZE(BUFFER), %xmm5
@ -2672,25 +2678,25 @@
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
movapd %xmm0, (Y) movapd %xmm0, (Y1)
movaps %xmm5, %xmm4 movaps %xmm5, %xmm4
addq $2 * SIZE, Y addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER addq $2 * SIZE, BUFFER
ALIGN_3 ALIGN_3
.L926: .L926:
testq $1, M testq $1, TMP_M
jle .L999 jle .L999
movsd (Y), %xmm0 movsd (Y1), %xmm0
shufpd $1, %xmm4, %xmm4 shufpd $1, %xmm4, %xmm4
addsd %xmm4, %xmm0 addsd %xmm4, %xmm0
movlpd %xmm0, (Y) movlpd %xmm0, (Y1)
ALIGN_3 ALIGN_3
jmp .L999 jmp .L999
@ -2700,53 +2706,53 @@
testq $SIZE, BUFFER testq $SIZE, BUFFER
je .L960 je .L960
movsd (Y), %xmm0 movsd (Y1), %xmm0
addsd (BUFFER), %xmm0 addsd (BUFFER), %xmm0
movsd %xmm0, (Y) movsd %xmm0, (Y1)
addq INCY, Y addq INCY, Y1
addq $SIZE, BUFFER addq $SIZE, BUFFER
decq M decq TMP_M
jle .L999 jle .L999
ALIGN_4 ALIGN_4
.L960: .L960:
movq Y, Y1 movq Y1, Y2
movq M, %rax movq TMP_M, %rax
sarq $3, %rax sarq $3, %rax
jle .L964 jle .L964
ALIGN_3 ALIGN_3
.L962: .L962:
movsd (Y), %xmm0 movsd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm0 movhpd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movsd (Y), %xmm1 movsd (Y2), %xmm1
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm1 movhpd (Y2), %xmm1
addq INCY, Y addq INCY, Y2
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
movsd (Y), %xmm2 movsd (Y2), %xmm2
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm2 movhpd (Y2), %xmm2
addq INCY, Y addq INCY, Y2
movapd 4 * SIZE(BUFFER), %xmm6 movapd 4 * SIZE(BUFFER), %xmm6
addpd %xmm4, %xmm0 addpd %xmm4, %xmm0
movsd (Y), %xmm3 movsd (Y2), %xmm3
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm3 movhpd (Y2), %xmm3
addq INCY, Y addq INCY, Y2
movapd 6 * SIZE(BUFFER), %xmm7 movapd 6 * SIZE(BUFFER), %xmm7
@ -2781,23 +2787,23 @@
ALIGN_3 ALIGN_3
.L964: .L964:
testq $7, M testq $7, TMP_M
jle .L999 jle .L999
testq $4, M testq $4, TMP_M
jle .L965 jle .L965
movsd (Y), %xmm0 movsd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm0 movhpd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
movsd (Y), %xmm1 movsd (Y2), %xmm1
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm1 movhpd (Y2), %xmm1
addq INCY, Y addq INCY, Y2
movapd 2 * SIZE(BUFFER), %xmm5 movapd 2 * SIZE(BUFFER), %xmm5
@ -2817,13 +2823,13 @@
ALIGN_3 ALIGN_3
.L965: .L965:
testq $2, M testq $2, TMP_M
jle .L966 jle .L966
movsd (Y), %xmm0 movsd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movhpd (Y), %xmm0 movhpd (Y2), %xmm0
addq INCY, Y addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4 movapd 0 * SIZE(BUFFER), %xmm4
@ -2838,10 +2844,10 @@
ALIGN_3 ALIGN_3
.L966: .L966:
testq $1, M testq $1, TMP_M
jle .L999 jle .L999
movsd (Y), %xmm0 movsd (Y2), %xmm0
movsd 0 * SIZE(BUFFER), %xmm4 movsd 0 * SIZE(BUFFER), %xmm4
@ -2853,6 +2859,9 @@
.L999: .L999:
leaq (, M, SIZE), %rax leaq (, M, SIZE), %rax
addq %rax,AA addq %rax,AA
movq STACK_INCY, INCY
imulq INCY, %rax
addq %rax, Y
jmp .L0t jmp .L0t
ALIGN_4 ALIGN_4