diff --git a/driver/others/blas_server.c b/driver/others/blas_server.c
index 2afcb742e..cc1015b7e 100644
--- a/driver/others/blas_server.c
+++ b/driver/others/blas_server.c
@@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 
 #define ATTRIBUTE_SIZE 128
 
+extern void openblas_warning(int verbose, const char * msg);
+
 /* This is a thread server model implementation.  The threads are   */
 /* spawned at first access to blas library, and still remains until */
 /* destruction routine is called.  The number of threads are        */
@@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
   return 0;
 }
 
+/*
+https://github.com/xianyi/OpenBLAS/issues/294
+Use pthread_atfork to shut down blas_thread_server before fork,
+then re-initialize it in both the parent and the child after fork.
+*/
+void openblas_fork_handler()
+{
+  int err;
+  err = pthread_atfork (BLASFUNC(blas_thread_shutdown), blas_thread_init, blas_thread_init);
+  if(err != 0)
+    openblas_warning(0, "OpenBLAS cannot install fork handler. The program may hang after fork().\n");
+}
 #endif
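For reference, pthread_atfork(3) registers three handlers: the first runs in the parent immediately before fork(), and the other two run immediately after fork() in the parent and in the child, respectively. The hunk above passes blas_thread_shutdown as the pre-fork handler and blas_thread_init as both post-fork handlers, so the thread server is torn down before the fork and rebuilt on both sides afterwards. A minimal stand-alone sketch of the same pattern; server_shutdown() and server_init() are hypothetical stand-ins, not the OpenBLAS functions:

    #include <pthread.h>
    #include <stdio.h>
    #include <sys/wait.h>
    #include <unistd.h>

    /* Hypothetical stand-ins for blas_thread_shutdown()/blas_thread_init(). */
    static void server_shutdown(void) { fprintf(stderr, "prepare: stopping worker threads\n"); }
    static void server_init(void)     { fprintf(stderr, "restarting worker threads\n"); }

    int main(void)
    {
        /* Same registration pattern as openblas_fork_handler() above. */
        if (pthread_atfork(server_shutdown, server_init, server_init) != 0)
            fprintf(stderr, "cannot install fork handler\n");

        pid_t pid = fork();      /* handlers fire around this call */
        if (pid == 0)
            _exit(0);            /* child: the server was re-initialized here */
        waitpid(pid, NULL, 0);
        return 0;
    }

Without such handlers, a process that forks while the thread server holds locks or has live worker threads can deadlock in the child, which is the hang reported in the issue linked above.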
diff --git a/driver/others/blas_server_omp.c b/driver/others/blas_server_omp.c
index 0a484f3e4..090590e6a 100644
--- a/driver/others/blas_server_omp.c
+++ b/driver/others/blas_server_omp.c
@@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
   return 0;
 }
 
+void openblas_fork_handler()
+{
+
+}
+
 #endif
diff --git a/driver/others/blas_server_win32.c b/driver/others/blas_server_win32.c
index 100ca34f7..68630a679 100644
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
 {
 	goto_set_num_threads(num);
 }
+
+void openblas_fork_handler()
+{
+
+}
diff --git a/driver/others/memory.c b/driver/others/memory.c
index 35758d13c..4faf82f29 100644
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
 
 #ifdef SMP
   if (blas_cpu_number == 0) blas_get_cpu_number();
 #ifdef SMP_SERVER
-  if (blas_server_avail == 0) blas_thread_init();
+  if (blas_server_avail == 0) {
+    blas_thread_init();
+    //register the fork handler so pthread-based threading stays safe across fork()
+    openblas_fork_handler();
+  }
 #endif
 #endif
diff --git a/kernel/arm/KERNEL.ARMV6 b/kernel/arm/KERNEL.ARMV6
index f47a843f3..02627cbd0 100644
--- a/kernel/arm/KERNEL.ARMV6
+++ b/kernel/arm/KERNEL.ARMV6
@@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
 CAXPYKERNEL = axpy_vfp.S
 ZAXPYKERNEL = axpy_vfp.S
 
-SCOPYKERNEL = scopy_vfp.S
-DCOPYKERNEL = dcopy_vfp.S
-CCOPYKERNEL = ccopy_vfp.S
-ZCOPYKERNEL = zcopy_vfp.S
+SCOPYKERNEL = copy.c
+DCOPYKERNEL = copy.c
+CCOPYKERNEL = zcopy.c
+ZCOPYKERNEL = zcopy.c
 
 SDOTKERNEL = sdot_vfp.S
 DDOTKERNEL = ddot_vfp.S
diff --git a/kernel/arm/KERNEL.ARMV7 b/kernel/arm/KERNEL.ARMV7
index 507f9813c..df66388db 100644
--- a/kernel/arm/KERNEL.ARMV7
+++ b/kernel/arm/KERNEL.ARMV7
@@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
 CAXPYKERNEL = axpy_vfp.S
 ZAXPYKERNEL = axpy_vfp.S
 
-SCOPYKERNEL = scopy_vfp.S
-DCOPYKERNEL = dcopy_vfp.S
-CCOPYKERNEL = ccopy_vfp.S
-ZCOPYKERNEL = zcopy_vfp.S
+SCOPYKERNEL = copy.c
+DCOPYKERNEL = copy.c
+CCOPYKERNEL = zcopy.c
+ZCOPYKERNEL = zcopy.c
 
 SDOTKERNEL = sdot_vfp.S
 DDOTKERNEL = ddot_vfp.S
@@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
 ZROTKERNEL = rot_vfp.S
 
 SSCALKERNEL = scal_vfp.S
-DSCALKERNEL = scal_vfp.S
+DSCALKERNEL = scal.c
 CSCALKERNEL = scal_vfp.S
 ZSCALKERNEL = scal_vfp.S
 
 SGEMVNKERNEL = gemv_n_vfp.S
-DGEMVNKERNEL = gemv_n_vfp.S
+DGEMVNKERNEL = gemv_n.c
 CGEMVNKERNEL = cgemv_n_vfp.S
 ZGEMVNKERNEL = zgemv_n_vfp.S
diff --git a/kernel/x86_64/dgemv_n.S b/kernel/x86_64/dgemv_n.S
index 65e228584..5f4c40467 100644
--- a/kernel/x86_64/dgemv_n.S
+++ b/kernel/x86_64/dgemv_n.S
@@ -111,6 +111,9 @@
 #define MM M
 #endif
 
+#define TMP_M %r15
+#define Y2 %rbx
+
 	PROLOGUE
 	PROFCODE
 
@@ -170,8 +173,9 @@
 	jge	.L00t
 
 	movq	MMM,M
-	addq	I,M
+	addq	M, I
 	jle	.L999x
+	movq	I, M
 
 .L00t:
 	movq	XX,X
@@ -2463,21 +2467,23 @@
 	cmpq	Y, BUFFER
 	je	.L999
 #endif
-
+	movq	M, TMP_M
+	movq	Y, Y1
+
 	cmpq	$SIZE, INCY
 	jne	.L950
 
-	testq	$SIZE, Y
+	testq	$SIZE, Y1
 	je	.L910
 
-	movsd	(Y), %xmm0
+	movsd	(Y1), %xmm0
 	addsd	(BUFFER), %xmm0
-	movsd	%xmm0, (Y)
+	movsd	%xmm0, (Y1)
 
-	addq	$SIZE, Y
+	addq	$SIZE, Y1
 	addq	$SIZE, BUFFER
 
-	decq	M
+	decq	TMP_M
 	jle	.L999
 	ALIGN_4
 
@@ -2485,20 +2491,20 @@
 	testq	$SIZE, BUFFER
 	jne	.L920
 
-	movq	M, %rax
+	movq	TMP_M, %rax
 	sarq	$3, %rax
 	jle	.L914
 	ALIGN_3
 
 .L912:
 #ifdef PREFETCHW
-	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y)
+	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
 #endif
 
-	movapd	0 * SIZE(Y), %xmm0
-	movapd	2 * SIZE(Y), %xmm1
-	movapd	4 * SIZE(Y), %xmm2
-	movapd	6 * SIZE(Y), %xmm3
+	movapd	0 * SIZE(Y1), %xmm0
+	movapd	2 * SIZE(Y1), %xmm1
+	movapd	4 * SIZE(Y1), %xmm2
+	movapd	6 * SIZE(Y1), %xmm3
 
 	movapd	0 * SIZE(BUFFER), %xmm4
 	movapd	2 * SIZE(BUFFER), %xmm5
@@ -2514,12 +2520,12 @@
 	addpd	%xmm6, %xmm2
 	addpd	%xmm7, %xmm3
 
-	movapd	%xmm0, 0 * SIZE(Y)
-	movapd	%xmm1, 2 * SIZE(Y)
-	movapd	%xmm2, 4 * SIZE(Y)
-	movapd	%xmm3, 6 * SIZE(Y)
+	movapd	%xmm0, 0 * SIZE(Y1)
+	movapd	%xmm1, 2 * SIZE(Y1)
+	movapd	%xmm2, 4 * SIZE(Y1)
+	movapd	%xmm3, 6 * SIZE(Y1)
 
-	addq	$8 * SIZE, Y
+	addq	$8 * SIZE, Y1
 	addq	$8 * SIZE, BUFFER
 
 	decq	%rax
@@ -2527,14 +2533,14 @@
 	ALIGN_3
 
 .L914:
-	testq	$7, M
+	testq	$7, TMP_M
 	jle	.L999
 
-	testq	$4, M
+	testq	$4, TMP_M
 	jle	.L915
 
-	movapd	0 * SIZE(Y), %xmm0
-	movapd	2 * SIZE(Y), %xmm1
+	movapd	0 * SIZE(Y1), %xmm0
+	movapd	2 * SIZE(Y1), %xmm1
 
 	movapd	0 * SIZE(BUFFER), %xmm4
 	movapd	2 * SIZE(BUFFER), %xmm5
@@ -2542,40 +2548,40 @@
 	addpd	%xmm4, %xmm0
 	addpd	%xmm5, %xmm1
 
-	movapd	%xmm0, 0 * SIZE(Y)
-	movapd	%xmm1, 2 * SIZE(Y)
+	movapd	%xmm0, 0 * SIZE(Y1)
+	movapd	%xmm1, 2 * SIZE(Y1)
 
-	addq	$4 * SIZE, Y
+	addq	$4 * SIZE, Y1
 	addq	$4 * SIZE, BUFFER
 	ALIGN_3
 
 .L915:
-	testq	$2, M
+	testq	$2, TMP_M
 	jle	.L916
 
-	movapd	(Y), %xmm0
+	movapd	(Y1), %xmm0
 	movapd	(BUFFER), %xmm4
 
 	addpd	%xmm4, %xmm0
 
-	movapd	%xmm0, (Y)
+	movapd	%xmm0, (Y1)
 
-	addq	$2 * SIZE, Y
+	addq	$2 * SIZE, Y1
 	addq	$2 * SIZE, BUFFER
 	ALIGN_3
 
 .L916:
-	testq	$1, M
+	testq	$1, TMP_M
 	jle	.L999
 
-	movsd	(Y), %xmm0
+	movsd	(Y1), %xmm0
 	movsd	0 * SIZE(BUFFER), %xmm4
 
 	addsd	%xmm4, %xmm0
 
-	movlpd	%xmm0, (Y)
+	movlpd	%xmm0, (Y1)
 	ALIGN_3
 
 	jmp	.L999
@@ -2584,20 +2590,20 @@
 
 .L920:
 	movapd	-1 * SIZE(BUFFER), %xmm4
 
-	movq	M, %rax
+	movq	TMP_M, %rax
 	sarq	$3, %rax
 	jle	.L924
 	ALIGN_3
 
 .L922:
 #ifdef PREFETCHW
-	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y)
+	PREFETCHW	(PREFETCHSIZE) * 4 + PREOFFSET(Y1)
 #endif
 
-	movapd	0 * SIZE(Y), %xmm0
-	movapd	2 * SIZE(Y), %xmm1
-	movapd	4 * SIZE(Y), %xmm2
-	movapd	6 * SIZE(Y), %xmm3
+	movapd	0 * SIZE(Y1), %xmm0
+	movapd	2 * SIZE(Y1), %xmm1
+	movapd	4 * SIZE(Y1), %xmm2
+	movapd	6 * SIZE(Y1), %xmm3
 
 	movapd	1 * SIZE(BUFFER), %xmm5
 	movapd	3 * SIZE(BUFFER), %xmm6
@@ -2618,14 +2624,14 @@
 	addpd	%xmm6, %xmm2
 	addpd	%xmm7, %xmm3
 
-	movapd	%xmm0, 0 * SIZE(Y)
-	movapd	%xmm1, 2 * SIZE(Y)
-	movapd	%xmm2, 4 * SIZE(Y)
-	movapd	%xmm3, 6 * SIZE(Y)
+	movapd	%xmm0, 0 * SIZE(Y1)
+	movapd	%xmm1, 2 * SIZE(Y1)
+	movapd	%xmm2, 4 * SIZE(Y1)
+	movapd	%xmm3, 6 * SIZE(Y1)
 
 	movapd	%xmm8, %xmm4
 
-	addq	$8 * SIZE, Y
+	addq	$8 * SIZE, Y1
 	addq	$8 * SIZE, BUFFER
 
 	decq	%rax
@@ -2633,14 +2639,14 @@
 	ALIGN_3
 
 .L924:
-	testq	$7, M
+	testq	$7, TMP_M
 	jle	.L999
 
-	testq	$4, M
+	testq	$4, TMP_M
 	jle	.L925
 
-	movapd	0 * SIZE(Y), %xmm0
-	movapd	2 * SIZE(Y), %xmm1
+	movapd	0 * SIZE(Y1), %xmm0
+	movapd	2 * SIZE(Y1), %xmm1
 
 	movapd	1 * SIZE(BUFFER), %xmm5
 	movapd	3 * SIZE(BUFFER), %xmm6
@@ -2651,20 +2657,20 @@
 	addpd	%xmm4, %xmm0
 	addpd	%xmm5, %xmm1
 
-	movapd	%xmm0, 0 * SIZE(Y)
-	movapd	%xmm1, 2 * SIZE(Y)
+	movapd	%xmm0, 0 * SIZE(Y1)
+	movapd	%xmm1, 2 * SIZE(Y1)
 
 	movapd	%xmm6, %xmm4
 
-	addq	$4 * SIZE, Y
+	addq	$4 * SIZE, Y1
 	addq	$4 * SIZE, BUFFER
 	ALIGN_3
 
 .L925:
-	testq	$2, M
+	testq	$2, TMP_M
 	jle	.L926
 
-	movapd	(Y), %xmm0
+	movapd	(Y1), %xmm0
 
 	movapd	1 * SIZE(BUFFER), %xmm5
@@ -2672,25 +2678,25 @@
 	addpd	%xmm4, %xmm0
 
-	movapd	%xmm0, (Y)
+	movapd	%xmm0, (Y1)
 
 	movaps	%xmm5, %xmm4
 
-	addq	$2 * SIZE, Y
+	addq	$2 * SIZE, Y1
 	addq	$2 * SIZE, BUFFER
 	ALIGN_3
 
 .L926:
-	testq	$1, M
+	testq	$1, TMP_M
 	jle	.L999
 
-	movsd	(Y), %xmm0
+	movsd	(Y1), %xmm0
 
 	shufpd	$1, %xmm4, %xmm4
 
 	addsd	%xmm4, %xmm0
 
-	movlpd	%xmm0, (Y)
+	movlpd	%xmm0, (Y1)
 	ALIGN_3
 
 	jmp	.L999
@@ -2700,53 +2706,53 @@
 	testq	$SIZE, BUFFER
 	je	.L960
 
-	movsd	(Y), %xmm0
+	movsd	(Y1), %xmm0
 	addsd	(BUFFER), %xmm0
-	movsd	%xmm0, (Y)
+	movsd	%xmm0, (Y1)
 
-	addq	INCY, Y
+	addq	INCY, Y1
 	addq	$SIZE, BUFFER
 
-	decq	M
+	decq	TMP_M
 	jle	.L999
 	ALIGN_4
 
 .L960:
-	movq	Y, Y1
+	movq	Y1, Y2
 
-	movq	M, %rax
+	movq	TMP_M, %rax
 	sarq	$3, %rax
 	jle	.L964
 	ALIGN_3
 
 .L962:
-	movsd	(Y), %xmm0
-	addq	INCY, Y
-	movhpd	(Y), %xmm0
-	addq	INCY, Y
+	movsd	(Y2), %xmm0
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm0
+	addq	INCY, Y2
 
 	movapd	0 * SIZE(BUFFER), %xmm4
 
-	movsd	(Y), %xmm1
-	addq	INCY, Y
-	movhpd	(Y), %xmm1
-	addq	INCY, Y
+	movsd	(Y2), %xmm1
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm1
+	addq	INCY, Y2
 
 	movapd	2 * SIZE(BUFFER), %xmm5
 
-	movsd	(Y), %xmm2
-	addq	INCY, Y
-	movhpd	(Y), %xmm2
-	addq	INCY, Y
+	movsd	(Y2), %xmm2
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm2
+	addq	INCY, Y2
 
 	movapd	4 * SIZE(BUFFER), %xmm6
 
 	addpd	%xmm4, %xmm0
 
-	movsd	(Y), %xmm3
-	addq	INCY, Y
-	movhpd	(Y), %xmm3
-	addq	INCY, Y
+	movsd	(Y2), %xmm3
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm3
+	addq	INCY, Y2
 
 	movapd	6 * SIZE(BUFFER), %xmm7
 
@@ -2781,23 +2787,23 @@
 	ALIGN_3
 
 .L964:
-	testq	$7, M
+	testq	$7, TMP_M
 	jle	.L999
 
-	testq	$4, M
+	testq	$4, TMP_M
 	jle	.L965
 
-	movsd	(Y), %xmm0
-	addq	INCY, Y
-	movhpd	(Y), %xmm0
-	addq	INCY, Y
+	movsd	(Y2), %xmm0
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm0
+	addq	INCY, Y2
 
 	movapd	0 * SIZE(BUFFER), %xmm4
 
-	movsd	(Y), %xmm1
-	addq	INCY, Y
-	movhpd	(Y), %xmm1
-	addq	INCY, Y
+	movsd	(Y2), %xmm1
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm1
+	addq	INCY, Y2
 
 	movapd	2 * SIZE(BUFFER), %xmm5
 
@@ -2817,13 +2823,13 @@
 	ALIGN_3
 
 .L965:
-	testq	$2, M
+	testq	$2, TMP_M
 	jle	.L966
 
-	movsd	(Y), %xmm0
-	addq	INCY, Y
-	movhpd	(Y), %xmm0
-	addq	INCY, Y
+	movsd	(Y2), %xmm0
+	addq	INCY, Y2
+	movhpd	(Y2), %xmm0
+	addq	INCY, Y2
 
 	movapd	0 * SIZE(BUFFER), %xmm4
 
@@ -2838,10 +2844,10 @@
 	ALIGN_3
 
 .L966:
-	testq	$1, M
+	testq	$1, TMP_M
 	jle	.L999
 
-	movsd	(Y), %xmm0
+	movsd	(Y2), %xmm0
 
 	movsd	0 * SIZE(BUFFER), %xmm4
 
@@ -2853,6 +2859,9 @@
 .L999:
 	leaq	(, M, SIZE), %rax
 	addq	%rax,AA
+	movq	STACK_INCY, INCY
+	imulq	INCY, %rax
+	addq	%rax, Y
 	jmp	.L0t
 	ALIGN_4
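The dgemv_n.S hunks above serve two purposes. Renaming the working copies of the row count and output pointer to TMP_M, Y1 and Y2 keeps the original M and Y intact through the copy-back loops, and the new instructions at .L999 advance Y by the number of rows just processed, scaled by INCY, before jumping back to .L0t. The apparent intent is that when a tall matrix is processed in chunks (the MMM/.L0t loop), each chunk must resume writing y where the previous one stopped. A rough C sketch of that chunking idea; the names, the CHUNK size, and the omission of the alpha scaling are illustrative assumptions, not OpenBLAS code:

    #include <stddef.h>

    /* One block of y += A * x for rows [0, m): column-major A, strided y. */
    static void dgemv_n_block(size_t m, size_t n, const double *a, size_t lda,
                              const double *x, double *y, size_t incy)
    {
        for (size_t i = 0; i < m; i++) {
            double s = 0.0;
            for (size_t j = 0; j < n; j++)
                s += a[i + j * lda] * x[j];
            y[i * incy] += s;
        }
    }

    /* Hypothetical chunked driver: without the "y += mb * incy" step, every
     * chunk after the first would overwrite the start of y instead of
     * continuing where the previous chunk left off. */
    void dgemv_n_chunked(size_t m, size_t n, const double *a, size_t lda,
                         const double *x, double *y, size_t incy)
    {
        const size_t CHUNK = 4096;            /* illustrative block size */

        while (m > 0) {
            size_t mb = m < CHUNK ? m : CHUNK;
            dgemv_n_block(mb, n, a, lda, x, y, incy);
            a += mb;                          /* next block of rows of A */
            y += mb * incy;                   /* advance y, as the patch does */
            m -= mb;
        }
    }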