Files
OpenBLAS/driver/others/blas_server_win32.c
Jehan 13226e3101 driver: more reasonable thread wait timeout on Windows.
It used to be 5ms, which might not be long enough in some cases for the
thread to exit well, but then when set to 5000 (5s), it would slow down
any program depending on OpenBlas.

Let's just set it to 50ms, which is at least 10 times longer than
originally, but still reasonable in case of failed thread termination.
2019-12-13 09:52:33 +01:00

536 lines
14 KiB
C

/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
/* */
/* Redistribution and use in source and binary forms, with or */
/* without modification, are permitted provided that the following */
/* conditions are met: */
/* */
/* 1. Redistributions of source code must retain the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer. */
/* */
/* 2. Redistributions in binary form must reproduce the above */
/* copyright notice, this list of conditions and the following */
/* disclaimer in the documentation and/or other materials */
/* provided with the distribution. */
/* */
/* THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF TEXAS AT */
/* AUSTIN ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, */
/* INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF */
/* MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE */
/* DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OF TEXAS AT */
/* AUSTIN OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, */
/* INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES */
/* (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE */
/* GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR */
/* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF */
/* LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT */
/* (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT */
/* OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE */
/* POSSIBILITY OF SUCH DAMAGE. */
/* */
/* The views and conclusions contained in the software and */
/* documentation are those of the authors and should not be */
/* interpreted as representing official policies, either expressed */
/* or implied, of The University of Texas at Austin. */
/*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include "common.h"
#if defined(OS_CYGWIN_NT) && !defined(unlikely)
#ifdef __GNUC__
#define unlikely(x) __builtin_expect(!!(x), 0)
#else
#define unlikely(x) (x)
#endif
#endif
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
HANDLE killed;
blas_queue_t *queue; /* Parameter Pointer */
int shutdown; /* server shutdown flag */
} blas_pool_t;
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
/* Local Variables */
static BLASULONG server_lock = 0;
static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){
/* REAL / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((xdouble *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else
#endif
if (mode & BLAS_DOUBLE){
/* REAL / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double,
double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else {
/* REAL / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float,
float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
}
} else {
#ifdef EXPRECISION
if (mode & BLAS_XDOUBLE){
/* COMPLEX / Extended Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
xdouble *, BLASLONG, xdouble *, BLASLONG,
xdouble *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((xdouble *)args -> alpha)[0],
((xdouble *)args -> alpha)[1],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else
#endif
if (mode & BLAS_DOUBLE){
/* COMPLEX / Double */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, double, double,
double *, BLASLONG, double *, BLASLONG,
double *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((double *)args -> alpha)[0],
((double *)args -> alpha)[1],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
} else {
/* COMPLEX / Single */
void (*afunc)(BLASLONG, BLASLONG, BLASLONG, float, float,
float *, BLASLONG, float *, BLASLONG,
float *, BLASLONG, void *) = func;
afunc(args -> m, args -> n, args -> k,
((float *)args -> alpha)[0],
((float *)args -> alpha)[1],
args -> a, args -> lda,
args -> b, args -> ldb,
args -> c, args -> ldc, sb);
}
}
}
/* This is a main routine of threads. Each thread waits until job is */
/* queued. */
static DWORD WINAPI blas_thread_server(void *arg){
/* Thread identifier */
#ifdef SMP_DEBUG
BLASLONG cpu = (BLASLONG)arg;
#endif
void *buffer, *sa, *sb;
blas_queue_t *queue;
DWORD action;
HANDLE handles[] = {pool.filled, pool.killed};
/* Each server needs each buffer */
buffer = blas_memory_alloc(2);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
while (1){
/* Waiting for Queue */
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (action == WAIT_OBJECT_0 + 1) break;
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
EnterCriticalSection(&pool.lock);
queue = pool.queue;
if (queue) pool.queue = queue->next;
LeaveCriticalSection(&pool.lock);
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
if (pool.queue) SetEvent(pool.filled);
sa = queue -> sa;
sb = queue -> sb;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING1;
#endif
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
#ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else
#endif
if (queue -> mode & BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
} else {
#ifdef EXPRECISION
if (queue -> mode & BLAS_XDOUBLE){
sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else
#endif
if (queue -> mode & BLAS_DOUBLE){
sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
} else {
sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
}
}
queue->sb=sb;
}
#ifdef MONITOR
main_status[cpu] = MAIN_RUNNING2;
#endif
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
} else {
legacy_exec(routine, queue -> mode, queue -> args, sb);
}
}else{
continue; //if queue == NULL
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
EnterCriticalSection(&queue->lock);
queue -> status = BLAS_STATUS_FINISHED;
LeaveCriticalSection(&queue->lock);
SetEvent(queue->finish);
}
/* Shutdown procedure */
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
blas_memory_free(buffer);
return 0;
}
/* Initializing routine */
int blas_thread_init(void){
BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
if (!blas_server_avail){
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
pool.shutdown = 0;
pool.queue = NULL;
for(i = 0; i < blas_cpu_number - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_server_avail = 1;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
/*
User can call one of two routines.
exec_blas_async ... immediately returns after jobs are queued.
exec_blas ... returns after jobs are finished.
*/
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
blas_queue_t *current;
current = queue;
while (current) {
InitializeCriticalSection(&current -> lock);
current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
current -> position = pos;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif
current = current -> next;
pos ++;
}
EnterCriticalSection(&pool.lock);
if (pool.queue) {
current = pool.queue;
while (current -> next) current = current -> next;
current -> next = queue;
} else {
pool.queue = queue;
}
LeaveCriticalSection(&pool.lock);
SetEvent(pool.filled);
return 0;
}
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
#ifdef SMP_DEBUG
fprintf(STDERR, "Synchronization Waiting.\n");
#endif
while (num){
#ifdef SMP_DEBUG
fprintf(STDERR, "Waiting Queue ..\n");
#endif
WaitForSingleObject(queue->finish, INFINITE);
CloseHandle(queue->finish);
DeleteCriticalSection(&queue -> lock);
queue = queue -> next;
num --;
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Completely Done.\n\n");
#endif
return 0;
}
/* Execute Threads */
int exec_blas(BLASLONG num, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
#ifndef ALL_THREADED
int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
#endif
if ((num <= 0) || (queue == NULL)) return 0;
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
routine = queue -> routine;
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
} else {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
}
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
return 0;
}
/* Shutdown procedure, but user don't have to call this routine. The */
/* kernel automatically kill threads. */
int BLASFUNC(blas_thread_shutdown)(void){
int i;
if (!blas_server_avail) return 0;
LOCK_COMMAND(&server_lock);
if (blas_server_avail){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
void goto_set_num_threads(int num_threads)
{
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
//increased_threads = 1;
if (!blas_server_avail){
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
pool.shutdown = 0;
pool.queue = NULL;
blas_server_avail = 1;
}
for(i = blas_num_threads - 1; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
}
blas_cpu_number = num_threads;
}
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}