Merge branch 'develop' of github.com:xianyi/OpenBLAS into develop
This commit is contained in:
commit
dd2d3e61ab
|
@ -83,6 +83,8 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|||
|
||||
#define ATTRIBUTE_SIZE 128
|
||||
|
||||
extern void openblas_warning(int verbose, const char * msg);
|
||||
|
||||
/* This is a thread server model implementation. The threads are */
|
||||
/* spawned at first access to the blas library, and remain until the */
|
||||
/* destruction routine is called. The number of threads are */
|
||||
|
@ -921,5 +923,17 @@ int BLASFUNC(blas_thread_shutdown)(void){
|
|||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
https://github.com/xianyi/OpenBLAS/issues/294
|
||||
Use pthread_atfork to close blas_thread_server before fork.
|
||||
Then, re-init blas_thread_server after fork at child and parent.
|
||||
*/
|
||||
void openblas_fork_handler()
|
||||
{
|
||||
int err;
|
||||
err = pthread_atfork (BLASFUNC(blas_thread_shutdown), blas_thread_init, blas_thread_init);
|
||||
if(err != 0)
|
||||
openblas_warning(0, "OpenBLAS cannot install fork handler. You may meet hang after fork.\n");
|
||||
}
|
||||
#endif
|
||||
|
||||
|
|
|
@ -315,4 +315,9 @@ int exec_blas(BLASLONG num, blas_queue_t *queue){
|
|||
return 0;
|
||||
}
|
||||
|
||||
/* Fork-handler hook for this thread-server variant.
 * The body is deliberately empty: nothing is registered with
 * pthread_atfork() here.
 * NOTE(review): presumably this backend needs no shutdown/re-init around
 * fork() -- confirm. */
void openblas_fork_handler(void)
{
  /* no-op */
}
|
||||
|
||||
#endif
|
||||
|
|
|
@ -498,3 +498,8 @@ void openblas_set_num_threads(int num)
|
|||
{
|
||||
goto_set_num_threads(num);
|
||||
}
|
||||
|
||||
/* Fork-handler hook for this thread-server variant.
 * The body is deliberately empty: no fork handlers are installed here.
 * NOTE(review): presumably fork() handling does not apply to this
 * backend -- confirm. */
void openblas_fork_handler(void)
{
  /* no-op */
}
|
||||
|
|
|
@ -1288,7 +1288,11 @@ void CONSTRUCTOR gotoblas_init(void) {
|
|||
#ifdef SMP
|
||||
if (blas_cpu_number == 0) blas_get_cpu_number();
|
||||
#ifdef SMP_SERVER
|
||||
if (blas_server_avail == 0) blas_thread_init();
|
||||
if (blas_server_avail == 0) {
|
||||
blas_thread_init();
|
||||
//deal with pthread and fork.
|
||||
openblas_fork_handler();
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
|
|
|
@ -40,10 +40,10 @@ DAXPYKERNEL = axpy_vfp.S
|
|||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = scopy_vfp.S
|
||||
DCOPYKERNEL = dcopy_vfp.S
|
||||
CCOPYKERNEL = ccopy_vfp.S
|
||||
ZCOPYKERNEL = zcopy_vfp.S
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
|
|
|
@ -45,10 +45,10 @@ DAXPYKERNEL = axpy_vfp.S
|
|||
CAXPYKERNEL = axpy_vfp.S
|
||||
ZAXPYKERNEL = axpy_vfp.S
|
||||
|
||||
SCOPYKERNEL = scopy_vfp.S
|
||||
DCOPYKERNEL = dcopy_vfp.S
|
||||
CCOPYKERNEL = ccopy_vfp.S
|
||||
ZCOPYKERNEL = zcopy_vfp.S
|
||||
SCOPYKERNEL = copy.c
|
||||
DCOPYKERNEL = copy.c
|
||||
CCOPYKERNEL = zcopy.c
|
||||
ZCOPYKERNEL = zcopy.c
|
||||
|
||||
SDOTKERNEL = sdot_vfp.S
|
||||
DDOTKERNEL = ddot_vfp.S
|
||||
|
@ -66,12 +66,12 @@ CROTKERNEL = rot_vfp.S
|
|||
ZROTKERNEL = rot_vfp.S
|
||||
|
||||
SSCALKERNEL = scal_vfp.S
|
||||
DSCALKERNEL = scal_vfp.S
|
||||
DSCALKERNEL = scal.c
|
||||
CSCALKERNEL = scal_vfp.S
|
||||
ZSCALKERNEL = scal_vfp.S
|
||||
|
||||
SGEMVNKERNEL = gemv_n_vfp.S
|
||||
DGEMVNKERNEL = gemv_n_vfp.S
|
||||
DGEMVNKERNEL = gemv_n.c
|
||||
CGEMVNKERNEL = cgemv_n_vfp.S
|
||||
ZGEMVNKERNEL = zgemv_n_vfp.S
|
||||
|
||||
|
|
|
@ -111,6 +111,9 @@
|
|||
#define MM M
|
||||
#endif
|
||||
|
||||
#define TMP_M %r15
|
||||
#define Y2 %rbx
|
||||
|
||||
PROLOGUE
|
||||
PROFCODE
|
||||
|
||||
|
@ -170,8 +173,9 @@
|
|||
jge .L00t
|
||||
|
||||
movq MMM,M
|
||||
addq I,M
|
||||
addq M, I
|
||||
jle .L999x
|
||||
movq I, M
|
||||
|
||||
.L00t:
|
||||
movq XX,X
|
||||
|
@ -2463,21 +2467,23 @@
|
|||
cmpq Y, BUFFER
|
||||
je .L999
|
||||
#endif
|
||||
|
||||
movq M, TMP_M
|
||||
movq Y, Y1
|
||||
|
||||
cmpq $SIZE, INCY
|
||||
jne .L950
|
||||
|
||||
testq $SIZE, Y
|
||||
testq $SIZE, Y1
|
||||
je .L910
|
||||
|
||||
movsd (Y), %xmm0
|
||||
movsd (Y1), %xmm0
|
||||
addsd (BUFFER), %xmm0
|
||||
movsd %xmm0, (Y)
|
||||
movsd %xmm0, (Y1)
|
||||
|
||||
addq $SIZE, Y
|
||||
addq $SIZE, Y1
|
||||
addq $SIZE, BUFFER
|
||||
|
||||
decq M
|
||||
decq TMP_M
|
||||
jle .L999
|
||||
ALIGN_4
|
||||
|
||||
|
@ -2485,20 +2491,20 @@
|
|||
testq $SIZE, BUFFER
|
||||
jne .L920
|
||||
|
||||
movq M, %rax
|
||||
movq TMP_M, %rax
|
||||
sarq $3, %rax
|
||||
jle .L914
|
||||
ALIGN_3
|
||||
|
||||
.L912:
|
||||
#ifdef PREFETCHW
|
||||
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
|
||||
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
|
||||
#endif
|
||||
|
||||
movapd 0 * SIZE(Y), %xmm0
|
||||
movapd 2 * SIZE(Y), %xmm1
|
||||
movapd 4 * SIZE(Y), %xmm2
|
||||
movapd 6 * SIZE(Y), %xmm3
|
||||
movapd 0 * SIZE(Y1), %xmm0
|
||||
movapd 2 * SIZE(Y1), %xmm1
|
||||
movapd 4 * SIZE(Y1), %xmm2
|
||||
movapd 6 * SIZE(Y1), %xmm3
|
||||
|
||||
movapd 0 * SIZE(BUFFER), %xmm4
|
||||
movapd 2 * SIZE(BUFFER), %xmm5
|
||||
|
@ -2514,12 +2520,12 @@
|
|||
addpd %xmm6, %xmm2
|
||||
addpd %xmm7, %xmm3
|
||||
|
||||
movapd %xmm0, 0 * SIZE(Y)
|
||||
movapd %xmm1, 2 * SIZE(Y)
|
||||
movapd %xmm2, 4 * SIZE(Y)
|
||||
movapd %xmm3, 6 * SIZE(Y)
|
||||
movapd %xmm0, 0 * SIZE(Y1)
|
||||
movapd %xmm1, 2 * SIZE(Y1)
|
||||
movapd %xmm2, 4 * SIZE(Y1)
|
||||
movapd %xmm3, 6 * SIZE(Y1)
|
||||
|
||||
addq $8 * SIZE, Y
|
||||
addq $8 * SIZE, Y1
|
||||
addq $8 * SIZE, BUFFER
|
||||
|
||||
decq %rax
|
||||
|
@ -2527,14 +2533,14 @@
|
|||
ALIGN_3
|
||||
|
||||
.L914:
|
||||
testq $7, M
|
||||
testq $7, TMP_M
|
||||
jle .L999
|
||||
|
||||
testq $4, M
|
||||
testq $4, TMP_M
|
||||
jle .L915
|
||||
|
||||
movapd 0 * SIZE(Y), %xmm0
|
||||
movapd 2 * SIZE(Y), %xmm1
|
||||
movapd 0 * SIZE(Y1), %xmm0
|
||||
movapd 2 * SIZE(Y1), %xmm1
|
||||
|
||||
movapd 0 * SIZE(BUFFER), %xmm4
|
||||
movapd 2 * SIZE(BUFFER), %xmm5
|
||||
|
@ -2542,40 +2548,40 @@
|
|||
addpd %xmm4, %xmm0
|
||||
addpd %xmm5, %xmm1
|
||||
|
||||
movapd %xmm0, 0 * SIZE(Y)
|
||||
movapd %xmm1, 2 * SIZE(Y)
|
||||
movapd %xmm0, 0 * SIZE(Y1)
|
||||
movapd %xmm1, 2 * SIZE(Y1)
|
||||
|
||||
addq $4 * SIZE, Y
|
||||
addq $4 * SIZE, Y1
|
||||
addq $4 * SIZE, BUFFER
|
||||
ALIGN_3
|
||||
|
||||
.L915:
|
||||
testq $2, M
|
||||
testq $2, TMP_M
|
||||
jle .L916
|
||||
|
||||
movapd (Y), %xmm0
|
||||
movapd (Y1), %xmm0
|
||||
|
||||
movapd (BUFFER), %xmm4
|
||||
|
||||
addpd %xmm4, %xmm0
|
||||
|
||||
movapd %xmm0, (Y)
|
||||
movapd %xmm0, (Y1)
|
||||
|
||||
addq $2 * SIZE, Y
|
||||
addq $2 * SIZE, Y1
|
||||
addq $2 * SIZE, BUFFER
|
||||
ALIGN_3
|
||||
|
||||
.L916:
|
||||
testq $1, M
|
||||
testq $1, TMP_M
|
||||
jle .L999
|
||||
|
||||
movsd (Y), %xmm0
|
||||
movsd (Y1), %xmm0
|
||||
|
||||
movsd 0 * SIZE(BUFFER), %xmm4
|
||||
|
||||
addsd %xmm4, %xmm0
|
||||
|
||||
movlpd %xmm0, (Y)
|
||||
movlpd %xmm0, (Y1)
|
||||
ALIGN_3
|
||||
|
||||
jmp .L999
|
||||
|
@ -2584,20 +2590,20 @@
|
|||
.L920:
|
||||
movapd -1 * SIZE(BUFFER), %xmm4
|
||||
|
||||
movq M, %rax
|
||||
movq TMP_M, %rax
|
||||
sarq $3, %rax
|
||||
jle .L924
|
||||
ALIGN_3
|
||||
|
||||
.L922:
|
||||
#ifdef PREFETCHW
|
||||
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y)
|
||||
PREFETCHW (PREFETCHSIZE) * 4 + PREOFFSET(Y1)
|
||||
#endif
|
||||
|
||||
movapd 0 * SIZE(Y), %xmm0
|
||||
movapd 2 * SIZE(Y), %xmm1
|
||||
movapd 4 * SIZE(Y), %xmm2
|
||||
movapd 6 * SIZE(Y), %xmm3
|
||||
movapd 0 * SIZE(Y1), %xmm0
|
||||
movapd 2 * SIZE(Y1), %xmm1
|
||||
movapd 4 * SIZE(Y1), %xmm2
|
||||
movapd 6 * SIZE(Y1), %xmm3
|
||||
|
||||
movapd 1 * SIZE(BUFFER), %xmm5
|
||||
movapd 3 * SIZE(BUFFER), %xmm6
|
||||
|
@ -2618,14 +2624,14 @@
|
|||
addpd %xmm6, %xmm2
|
||||
addpd %xmm7, %xmm3
|
||||
|
||||
movapd %xmm0, 0 * SIZE(Y)
|
||||
movapd %xmm1, 2 * SIZE(Y)
|
||||
movapd %xmm2, 4 * SIZE(Y)
|
||||
movapd %xmm3, 6 * SIZE(Y)
|
||||
movapd %xmm0, 0 * SIZE(Y1)
|
||||
movapd %xmm1, 2 * SIZE(Y1)
|
||||
movapd %xmm2, 4 * SIZE(Y1)
|
||||
movapd %xmm3, 6 * SIZE(Y1)
|
||||
|
||||
movapd %xmm8, %xmm4
|
||||
|
||||
addq $8 * SIZE, Y
|
||||
addq $8 * SIZE, Y1
|
||||
addq $8 * SIZE, BUFFER
|
||||
|
||||
decq %rax
|
||||
|
@ -2633,14 +2639,14 @@
|
|||
ALIGN_3
|
||||
|
||||
.L924:
|
||||
testq $7, M
|
||||
testq $7, TMP_M
|
||||
jle .L999
|
||||
|
||||
testq $4, M
|
||||
testq $4, TMP_M
|
||||
jle .L925
|
||||
|
||||
movapd 0 * SIZE(Y), %xmm0
|
||||
movapd 2 * SIZE(Y), %xmm1
|
||||
movapd 0 * SIZE(Y1), %xmm0
|
||||
movapd 2 * SIZE(Y1), %xmm1
|
||||
|
||||
movapd 1 * SIZE(BUFFER), %xmm5
|
||||
movapd 3 * SIZE(BUFFER), %xmm6
|
||||
|
@ -2651,20 +2657,20 @@
|
|||
addpd %xmm4, %xmm0
|
||||
addpd %xmm5, %xmm1
|
||||
|
||||
movapd %xmm0, 0 * SIZE(Y)
|
||||
movapd %xmm1, 2 * SIZE(Y)
|
||||
movapd %xmm0, 0 * SIZE(Y1)
|
||||
movapd %xmm1, 2 * SIZE(Y1)
|
||||
|
||||
movapd %xmm6, %xmm4
|
||||
|
||||
addq $4 * SIZE, Y
|
||||
addq $4 * SIZE, Y1
|
||||
addq $4 * SIZE, BUFFER
|
||||
ALIGN_3
|
||||
|
||||
.L925:
|
||||
testq $2, M
|
||||
testq $2, TMP_M
|
||||
jle .L926
|
||||
|
||||
movapd (Y), %xmm0
|
||||
movapd (Y1), %xmm0
|
||||
|
||||
movapd 1 * SIZE(BUFFER), %xmm5
|
||||
|
||||
|
@ -2672,25 +2678,25 @@
|
|||
|
||||
addpd %xmm4, %xmm0
|
||||
|
||||
movapd %xmm0, (Y)
|
||||
movapd %xmm0, (Y1)
|
||||
|
||||
movaps %xmm5, %xmm4
|
||||
|
||||
addq $2 * SIZE, Y
|
||||
addq $2 * SIZE, Y1
|
||||
addq $2 * SIZE, BUFFER
|
||||
ALIGN_3
|
||||
|
||||
.L926:
|
||||
testq $1, M
|
||||
testq $1, TMP_M
|
||||
jle .L999
|
||||
|
||||
movsd (Y), %xmm0
|
||||
movsd (Y1), %xmm0
|
||||
|
||||
shufpd $1, %xmm4, %xmm4
|
||||
|
||||
addsd %xmm4, %xmm0
|
||||
|
||||
movlpd %xmm0, (Y)
|
||||
movlpd %xmm0, (Y1)
|
||||
ALIGN_3
|
||||
|
||||
jmp .L999
|
||||
|
@ -2700,53 +2706,53 @@
|
|||
testq $SIZE, BUFFER
|
||||
je .L960
|
||||
|
||||
movsd (Y), %xmm0
|
||||
movsd (Y1), %xmm0
|
||||
addsd (BUFFER), %xmm0
|
||||
movsd %xmm0, (Y)
|
||||
movsd %xmm0, (Y1)
|
||||
|
||||
addq INCY, Y
|
||||
addq INCY, Y1
|
||||
addq $SIZE, BUFFER
|
||||
|
||||
decq M
|
||||
decq TMP_M
|
||||
jle .L999
|
||||
ALIGN_4
|
||||
|
||||
.L960:
|
||||
movq Y, Y1
|
||||
movq Y1, Y2
|
||||
|
||||
movq M, %rax
|
||||
movq TMP_M, %rax
|
||||
sarq $3, %rax
|
||||
jle .L964
|
||||
ALIGN_3
|
||||
|
||||
.L962:
|
||||
movsd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 0 * SIZE(BUFFER), %xmm4
|
||||
|
||||
movsd (Y), %xmm1
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm1
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm1
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm1
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 2 * SIZE(BUFFER), %xmm5
|
||||
|
||||
movsd (Y), %xmm2
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm2
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm2
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm2
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 4 * SIZE(BUFFER), %xmm6
|
||||
|
||||
addpd %xmm4, %xmm0
|
||||
|
||||
movsd (Y), %xmm3
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm3
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm3
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm3
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 6 * SIZE(BUFFER), %xmm7
|
||||
|
||||
|
@ -2781,23 +2787,23 @@
|
|||
ALIGN_3
|
||||
|
||||
.L964:
|
||||
testq $7, M
|
||||
testq $7, TMP_M
|
||||
jle .L999
|
||||
|
||||
testq $4, M
|
||||
testq $4, TMP_M
|
||||
jle .L965
|
||||
|
||||
movsd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 0 * SIZE(BUFFER), %xmm4
|
||||
|
||||
movsd (Y), %xmm1
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm1
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm1
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm1
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 2 * SIZE(BUFFER), %xmm5
|
||||
|
||||
|
@ -2817,13 +2823,13 @@
|
|||
ALIGN_3
|
||||
|
||||
.L965:
|
||||
testq $2, M
|
||||
testq $2, TMP_M
|
||||
jle .L966
|
||||
|
||||
movsd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movhpd (Y), %xmm0
|
||||
addq INCY, Y
|
||||
movsd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
movhpd (Y2), %xmm0
|
||||
addq INCY, Y2
|
||||
|
||||
movapd 0 * SIZE(BUFFER), %xmm4
|
||||
|
||||
|
@ -2838,10 +2844,10 @@
|
|||
ALIGN_3
|
||||
|
||||
.L966:
|
||||
testq $1, M
|
||||
testq $1, TMP_M
|
||||
jle .L999
|
||||
|
||||
movsd (Y), %xmm0
|
||||
movsd (Y2), %xmm0
|
||||
|
||||
movsd 0 * SIZE(BUFFER), %xmm4
|
||||
|
||||
|
@ -2853,6 +2859,9 @@
|
|||
.L999:
|
||||
leaq (, M, SIZE), %rax
|
||||
addq %rax,AA
|
||||
movq STACK_INCY, INCY
|
||||
imulq INCY, %rax
|
||||
addq %rax, Y
|
||||
jmp .L0t
|
||||
ALIGN_4
|
||||
|
||||
|
|
Loading…
Reference in New Issue