Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop

This commit is contained in:
Zhang Xianyi 2014-02-18 15:53:57 +08:00
commit dd2d3e61ab
7 changed files with 144 additions and 107 deletions

View File

@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define ATTRIBUTE_SIZE 128
extern void openblas_warning(int verbose, const char * msg);
/* This is a thread server model implementation. The threads are */
/* spawned at first access to the BLAS library and remain until the */
/* destruction routine is called. The number of threads are */
@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
return 0;
}
/*
 * Fork handling for the BLAS thread server.
 * See https://github.com/xianyi/OpenBLAS/issues/294
 *
 * pthread_atfork() is used to close blas_thread_server right before fork()
 * and to re-initialize it after fork() in both the parent and the child.
 */

/*
 * atfork "prepare" hook: shut the thread server down before the fork.
 * pthread_atfork() requires handlers of type void (*)(void), whereas
 * blas_thread_shutdown returns int, so it is wrapped instead of being
 * passed directly. The return value is deliberately discarded because an
 * atfork handler has no way to report it.
 */
static void openblas_atfork_prepare(void)
{
  (void)BLASFUNC(blas_thread_shutdown)();
}

/*
 * atfork "parent" and "child" hook: re-initialize the thread server once
 * fork() has completed. Wrapped so that the registered handler has exactly
 * the void (*)(void) type that pthread_atfork() expects.
 */
static void openblas_atfork_restart(void)
{
  (void)blas_thread_init();
}

/*
 * Registers the hooks above with pthread_atfork(). Takes no arguments and
 * returns nothing; if registration fails, a warning is emitted through
 * openblas_warning() and execution continues without fork protection.
 */
void openblas_fork_handler(void)
{
  int err = pthread_atfork(openblas_atfork_prepare,
                           openblas_atfork_restart,
                           openblas_atfork_restart);

  if (err != 0) {
    openblas_warning(0, "OpenBLAS cannot install fork handler. You may meet hang after fork.\n");
  }
}
#endif

View File

@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
return 0;
}
/*
 * Fork-handler hook for this threading backend. Intentionally a no-op: no
 * fork handlers are installed here. The function exists so that code which
 * calls openblas_fork_handler() links regardless of the backend in use.
 */
void openblas_fork_handler(void)
{
  /* Nothing to install. */
}
#endif

View File

@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}
/*
 * Fork-handler hook for this threading backend. Intentionally a no-op: no
 * fork handlers are installed here. The function exists so that code which
 * calls openblas_fork_handler() links regardless of the backend in use.
 */
void openblas_fork_handler(void)
{
  /* Nothing to install. */
}

View File

@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
#ifdef SMP
if (blas_cpu_number == 0) blas_get_cpu_number();
#ifdef SMP_SERVER
if (blas_server_avail == 0) blas_thread_init();
if (blas_server_avail == 0) {
blas_thread_init();
// Install the pthread_atfork handlers for the BLAS thread server.
openblas_fork_handler();
}
#endif
#endif

View File

@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S

View File

@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
CAXPYKERNEL = axpy_vfp.S
ZAXPYKERNEL = axpy_vfp.S
SCOPYKERNEL = scopy_vfp.S
DCOPYKERNEL = dcopy_vfp.S
CCOPYKERNEL = ccopy_vfp.S
ZCOPYKERNEL = zcopy_vfp.S
SCOPYKERNEL = copy.c
DCOPYKERNEL = copy.c
CCOPYKERNEL = zcopy.c
ZCOPYKERNEL = zcopy.c
SDOTKERNEL = sdot_vfp.S
DDOTKERNEL = ddot_vfp.S
@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
ZROTKERNEL = rot_vfp.S
SSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal_vfp.S
DSCALKERNEL = scal.c
CSCALKERNEL = scal_vfp.S
ZSCALKERNEL = scal_vfp.S
SGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n_vfp.S
DGEMVNKERNEL = gemv_n.c
CGEMVNKERNEL = cgemv_n_vfp.S
ZGEMVNKERNEL = zgemv_n_vfp.S

View File

@ -111,6 +111,9 @@
#define MM M
#endif
#define TMP_M %r15
#define Y2 %rbx
PROLOGUE
PROFCODE
@ -170,8 +173,9 @@
jge .L00t
movq MMM,M
addq I,M
addq M, I
jle .L999x
movq I, M
.L00t:
movq XX,X
@ -2463,21 +2467,23 @@
cmpq Y, BUFFER
je .L999
#endif
movq M, TMP_M
movq Y, Y1
cmpq $SIZE, INCY
jne .L950
testq $SIZE, Y
testq $SIZE, Y1
je .L910
movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)
addq $SIZE, Y
addq $SIZE, Y1
addq $SIZE, BUFFER
decq M
decq TMP_M
jle .L999
ALIGN_4
@ -2485,20 +2491,20 @@
testq $SIZE, BUFFER
jne .L920
movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L914
ALIGN_3
.L912:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif
movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3
movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5
@ -2514,12 +2520,12 @@
addpd %xmm6, %xmm2
addpd %xmm7, %xmm3
movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)
addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER
decq %rax
@ -2527,14 +2533,14 @@
ALIGN_3
.L914:
testq $7, M
testq $7, TMP_M
jle .L999
testq $4, M
testq $4, TMP_M
jle .L915
movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 0 * SIZE(BUFFER), %xmm4
movapd 2 * SIZE(BUFFER), %xmm5
@ -2542,40 +2548,40 @@
addpd %xmm4, %xmm0
addpd %xmm5, %xmm1
movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER
ALIGN_3
.L915:
testq $2, M
testq $2, TMP_M
jle .L916
movapd (Y), %xmm0
movapd (Y1), %xmm0
movapd (BUFFER), %xmm4
addpd %xmm4, %xmm0
movapd %xmm0, (Y)
movapd %xmm0, (Y1)
addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER
ALIGN_3
.L916:
testq $1, M
testq $1, TMP_M
jle .L999
movsd (Y), %xmm0
movsd (Y1), %xmm0
movsd 0 * SIZE(BUFFER), %xmm4
addsd %xmm4, %xmm0
movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3
jmp .L999
@ -2584,20 +2590,20 @@
.L920:
movapd -1 * SIZE(BUFFER), %xmm4
movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L924
ALIGN_3
.L922:
#ifdef PREFETCHW
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
#endif
movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 4 * SIZE(Y), %xmm2
movapd 6 * SIZE(Y), %xmm3
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 4 * SIZE(Y1), %xmm2
movapd 6 * SIZE(Y1), %xmm3
movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6
@ -2618,14 +2624,14 @@
addpd %xmm6, %xmm2
addpd %xmm7, %xmm3
movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm2, 4 * SIZE(Y)
movapd %xmm3, 6 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm2, 4 * SIZE(Y1)
movapd %xmm3, 6 * SIZE(Y1)
movapd %xmm8, %xmm4
addq $8 * SIZE, Y
addq $8 * SIZE, Y1
addq $8 * SIZE, BUFFER
decq %rax
@ -2633,14 +2639,14 @@
ALIGN_3
.L924:
testq $7, M
testq $7, TMP_M
jle .L999
testq $4, M
testq $4, TMP_M
jle .L925
movapd 0 * SIZE(Y), %xmm0
movapd 2 * SIZE(Y), %xmm1
movapd 0 * SIZE(Y1), %xmm0
movapd 2 * SIZE(Y1), %xmm1
movapd 1 * SIZE(BUFFER), %xmm5
movapd 3 * SIZE(BUFFER), %xmm6
@ -2651,20 +2657,20 @@
addpd %xmm4, %xmm0
addpd %xmm5, %xmm1
movapd %xmm0, 0 * SIZE(Y)
movapd %xmm1, 2 * SIZE(Y)
movapd %xmm0, 0 * SIZE(Y1)
movapd %xmm1, 2 * SIZE(Y1)
movapd %xmm6, %xmm4
addq $4 * SIZE, Y
addq $4 * SIZE, Y1
addq $4 * SIZE, BUFFER
ALIGN_3
.L925:
testq $2, M
testq $2, TMP_M
jle .L926
movapd (Y), %xmm0
movapd (Y1), %xmm0
movapd 1 * SIZE(BUFFER), %xmm5
@ -2672,25 +2678,25 @@
addpd %xmm4, %xmm0
movapd %xmm0, (Y)
movapd %xmm0, (Y1)
movaps %xmm5, %xmm4
addq $2 * SIZE, Y
addq $2 * SIZE, Y1
addq $2 * SIZE, BUFFER
ALIGN_3
.L926:
testq $1, M
testq $1, TMP_M
jle .L999
movsd (Y), %xmm0
movsd (Y1), %xmm0
shufpd $1, %xmm4, %xmm4
addsd %xmm4, %xmm0
movlpd %xmm0, (Y)
movlpd %xmm0, (Y1)
ALIGN_3
jmp .L999
@ -2700,53 +2706,53 @@
testq $SIZE, BUFFER
je .L960
movsd (Y), %xmm0
movsd (Y1), %xmm0
addsd (BUFFER), %xmm0
movsd %xmm0, (Y)
movsd %xmm0, (Y1)
addq INCY, Y
addq INCY, Y1
addq $SIZE, BUFFER
decq M
decq TMP_M
jle .L999
ALIGN_4
.L960:
movq Y, Y1
movq Y1, Y2
movq M, %rax
movq TMP_M, %rax
sarq $3, %rax
jle .L964
ALIGN_3
.L962:
movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4
movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2
movapd 2 * SIZE(BUFFER), %xmm5
movsd (Y), %xmm2
addq INCY, Y
movhpd (Y), %xmm2
addq INCY, Y
movsd (Y2), %xmm2
addq INCY, Y2
movhpd (Y2), %xmm2
addq INCY, Y2
movapd 4 * SIZE(BUFFER), %xmm6
addpd %xmm4, %xmm0
movsd (Y), %xmm3
addq INCY, Y
movhpd (Y), %xmm3
addq INCY, Y
movsd (Y2), %xmm3
addq INCY, Y2
movhpd (Y2), %xmm3
addq INCY, Y2
movapd 6 * SIZE(BUFFER), %xmm7
@ -2781,23 +2787,23 @@
ALIGN_3
.L964:
testq $7, M
testq $7, TMP_M
jle .L999
testq $4, M
testq $4, TMP_M
jle .L965
movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4
movsd (Y), %xmm1
addq INCY, Y
movhpd (Y), %xmm1
addq INCY, Y
movsd (Y2), %xmm1
addq INCY, Y2
movhpd (Y2), %xmm1
addq INCY, Y2
movapd 2 * SIZE(BUFFER), %xmm5
@ -2817,13 +2823,13 @@
ALIGN_3
.L965:
testq $2, M
testq $2, TMP_M
jle .L966
movsd (Y), %xmm0
addq INCY, Y
movhpd (Y), %xmm0
addq INCY, Y
movsd (Y2), %xmm0
addq INCY, Y2
movhpd (Y2), %xmm0
addq INCY, Y2
movapd 0 * SIZE(BUFFER), %xmm4
@ -2838,10 +2844,10 @@
ALIGN_3
.L966:
testq $1, M
testq $1, TMP_M
jle .L999
movsd (Y), %xmm0
movsd (Y2), %xmm0
movsd 0 * SIZE(BUFFER), %xmm4
@ -2853,6 +2859,9 @@
.L999:
leaq (, M, SIZE), %rax
addq %rax,AA
movq STACK_INCY, INCY
imulq INCY, %rax
addq %rax, Y
jmp .L0t
ALIGN_4