949 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			C
		
	
	
	
			
		
		
	
	
			949 lines
		
	
	
		
			24 KiB
		
	
	
	
		
			C
		
	
	
	
/*****************************************************************************
 | 
						|
Copyright (c) 2011, Lab of Parallel Software and Computational Science,ICSAS
 | 
						|
All rights reserved.
 | 
						|
 | 
						|
Redistribution and use in source and binary forms, with or without
 | 
						|
modification, are permitted provided that the following conditions are
 | 
						|
met:
 | 
						|
 | 
						|
   1. Redistributions of source code must retain the above copyright
 | 
						|
      notice, this list of conditions and the following disclaimer.
 | 
						|
 | 
						|
   2. Redistributions in binary form must reproduce the above copyright
 | 
						|
      notice, this list of conditions and the following disclaimer in
 | 
						|
      the documentation and/or other materials provided with the
 | 
						|
      distribution.
 | 
						|
   3. Neither the name of the ISCAS nor the names of its contributors may 
 | 
						|
      be used to endorse or promote products derived from this software 
 | 
						|
      without specific prior written permission.
 | 
						|
 | 
						|
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 
 | 
						|
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 
 | 
						|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 
 | 
						|
ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE 
 | 
						|
LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 
 | 
						|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 
 | 
						|
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 
 | 
						|
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, 
 | 
						|
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE 
 | 
						|
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 | 
						|
 | 
						|
**********************************************************************************/
 | 
						|
 | 
						|
/*********************************************************************/
 | 
						|
/* Copyright 2009, 2010 The University of Texas at Austin.           */
 | 
						|
/* All rights reserved.                                              */
 | 
						|
/*                                                                   */
 | 
						|
/* Redistribution and use in source and binary forms, with or        */
 | 
						|
/* without modification, are permitted provided that the following   */
 | 
						|
/* conditions are met:                                               */
 | 
						|
/*                                                                   */
 | 
						|
/*   1. Redistributions of source code must retain the above         */
 | 
						|
/*      copyright notice, this list of conditions and the following  */
 | 
						|
/*      disclaimer.                                                  */
 | 
						|
/*                                                                   */
 | 
						|
/*   2. Redistributions in binary form must reproduce the above      */
 | 
						|
/*      copyright notice, this list of conditions and the following  */
 | 
						|
/*      disclaimer in the documentation and/or other materials       */
 | 
						|
/*      provided with the distribution.                              */
 | 
						|
/*                                                                   */
 | 
						|
/*    THIS  SOFTWARE IS PROVIDED  BY THE  UNIVERSITY OF  TEXAS AT    */
 | 
						|
/*    AUSTIN  ``AS IS''  AND ANY  EXPRESS OR  IMPLIED WARRANTIES,    */
 | 
						|
/*    INCLUDING, BUT  NOT LIMITED  TO, THE IMPLIED  WARRANTIES OF    */
 | 
						|
/*    MERCHANTABILITY  AND FITNESS FOR  A PARTICULAR  PURPOSE ARE    */
 | 
						|
/*    DISCLAIMED.  IN  NO EVENT SHALL THE UNIVERSITY  OF TEXAS AT    */
 | 
						|
/*    AUSTIN OR CONTRIBUTORS BE  LIABLE FOR ANY DIRECT, INDIRECT,    */
 | 
						|
/*    INCIDENTAL,  SPECIAL, EXEMPLARY,  OR  CONSEQUENTIAL DAMAGES    */
 | 
						|
/*    (INCLUDING, BUT  NOT LIMITED TO,  PROCUREMENT OF SUBSTITUTE    */
 | 
						|
/*    GOODS  OR  SERVICES; LOSS  OF  USE,  DATA,  OR PROFITS;  OR    */
 | 
						|
/*    BUSINESS INTERRUPTION) HOWEVER CAUSED  AND ON ANY THEORY OF    */
 | 
						|
/*    LIABILITY, WHETHER  IN CONTRACT, STRICT  LIABILITY, OR TORT    */
 | 
						|
/*    (INCLUDING NEGLIGENCE OR OTHERWISE)  ARISING IN ANY WAY OUT    */
 | 
						|
/*    OF  THE  USE OF  THIS  SOFTWARE,  EVEN  IF ADVISED  OF  THE    */
 | 
						|
/*    POSSIBILITY OF SUCH DAMAGE.                                    */
 | 
						|
/*                                                                   */
 | 
						|
/* The views and conclusions contained in the software and           */
 | 
						|
/* documentation are those of the authors and should not be          */
 | 
						|
/* interpreted as representing official policies, either expressed   */
 | 
						|
/* or implied, of The University of Texas at Austin.                 */
 | 
						|
/*********************************************************************/
 | 
						|
 | 
						|
#include "common.h"
 | 
						|
#ifdef OS_LINUX
 | 
						|
#include <dlfcn.h>
 | 
						|
#include <sys/resource.h>
 | 
						|
#endif
 | 
						|
 | 
						|
#ifndef likely
 | 
						|
#ifdef __GNUC__
 | 
						|
#define likely(x) __builtin_expect(!!(x), 1)
 | 
						|
#else
 | 
						|
#define likely(x) (x)
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
#ifndef unlikely
 | 
						|
#ifdef __GNUC__
 | 
						|
#define unlikely(x) __builtin_expect(!!(x), 0)
 | 
						|
#else
 | 
						|
#define unlikely(x) (x)
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef SMP_SERVER
 | 
						|
 | 
						|
#undef MONITOR
 | 
						|
#undef TIMING
 | 
						|
#undef TIMING_DEBUG
 | 
						|
#undef NEED_STACKATTR
 | 
						|
 | 
						|
#define ATTRIBUTE_SIZE 128
 | 
						|
 | 
						|
/* This is a thread server model implementation.  The threads are   */
/* spawned at the first access to the BLAS library and remain alive */
/* until the destruction routine is called.  The number of threads  */
/* is equal to "OMP_NUM_THREADS - 1", and a thread only wakes up    */
/* when a job is queued.                                            */
 | 
						|
 | 
						|
/* We need this global for checking if initialization is finished.  */
 | 
						|
int blas_server_avail   __attribute__((aligned(ATTRIBUTE_SIZE))) = 0;
 | 
						|
 | 
						|
/* Local Variables */
 | 
						|
#if   defined(USE_PTHREAD_LOCK)
 | 
						|
static pthread_mutex_t  server_lock    = PTHREAD_MUTEX_INITIALIZER;
 | 
						|
#elif defined(USE_PTHREAD_SPINLOCK)
 | 
						|
static pthread_spinlock_t  server_lock = 0;
 | 
						|
#else
 | 
						|
static unsigned long server_lock       = 0;
 | 
						|
#endif
 | 
						|
 | 
						|
#define THREAD_STATUS_SLEEP		2
 | 
						|
#define THREAD_STATUS_WAKEUP		4
 | 
						|
 | 
						|
static pthread_t       blas_threads [MAX_CPU_NUMBER];
 | 
						|
 | 
						|
typedef struct {
 | 
						|
  blas_queue_t * volatile queue   __attribute__((aligned(ATTRIBUTE_SIZE)));
 | 
						|
 | 
						|
#if defined(OS_LINUX) && !defined(NO_AFFINITY)
 | 
						|
  int	node;
 | 
						|
#endif
 | 
						|
 | 
						|
  volatile long		 status;
 | 
						|
 | 
						|
  pthread_mutex_t	 lock;
 | 
						|
  pthread_cond_t	 wakeup;
 | 
						|
 | 
						|
} thread_status_t;
 | 
						|
 | 
						|
static thread_status_t thread_status[MAX_CPU_NUMBER] __attribute__((aligned(ATTRIBUTE_SIZE)));
 | 
						|
 | 
						|
#ifndef THREAD_TIMEOUT
 | 
						|
#define THREAD_TIMEOUT	28
 | 
						|
#endif
 | 
						|
 | 
						|
static unsigned int thread_timeout = (1U << (THREAD_TIMEOUT));
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
 | 
						|
/* The monitor is a function that reports each thread's status once */
/* every second.  It is usually turned off; it is for debugging.    */
 | 
						|
 | 
						|
static pthread_t      monitor_thread;
 | 
						|
static int main_status[MAX_CPU_NUMBER];
 | 
						|
#define MAIN_ENTER	 0x01
 | 
						|
#define MAIN_EXIT	 0x02
 | 
						|
#define MAIN_TRYLOCK	 0x03
 | 
						|
#define MAIN_LOCKSUCCESS 0x04
 | 
						|
#define MAIN_QUEUING	 0x05
 | 
						|
#define MAIN_RECEIVING   0x06
 | 
						|
#define MAIN_RUNNING1    0x07
 | 
						|
#define MAIN_RUNNING2    0x08
 | 
						|
#define MAIN_RUNNING3    0x09
 | 
						|
#define MAIN_WAITING	 0x0a
 | 
						|
#define MAIN_SLEEPING	 0x0b
 | 
						|
#define MAIN_FINISH      0x0c
 | 
						|
#define MAIN_DONE	 0x0d
 | 
						|
#endif
 | 
						|
 | 
						|
#define BLAS_QUEUE_FINISHED	3
 | 
						|
#define BLAS_QUEUE_RUNNING	4
 | 
						|
 | 
						|
#ifdef TIMING
 | 
						|
BLASLONG	exit_time[MAX_CPU_NUMBER];
 | 
						|
#endif
 | 
						|
 | 
						|
/*
 * Call a kernel that uses the legacy argument layout
 *   f(m, n, k, alpha[, alpha_imag], a, lda, b, ldb, c, ldc, sb).
 *
 * func : kernel entry point, passed as an untyped pointer.
 * mode : BLAS_XDOUBLE (only when EXPRECISION is defined) or BLAS_DOUBLE
 *        selects the scalar type, otherwise float is used; BLAS_COMPLEX
 *        selects the form that takes two alpha scalars.
 * args : problem description; args->alpha is read as one or two scalars
 *        of the selected type.
 * sb   : forwarded unchanged as the last kernel argument.
 */
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){

  const int is_complex = (mode & BLAS_COMPLEX) != 0;

#ifdef EXPRECISION
  if (mode & BLAS_XDOUBLE) {
    xdouble *alpha = (xdouble *)args -> alpha;

    if (is_complex) {
      /* COMPLEX / Extended Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble, xdouble,
                     xdouble *, BLASLONG, xdouble *, BLASLONG,
                     xdouble *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    } else {
      /* REAL / Extended Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, xdouble,
                     xdouble *, BLASLONG, xdouble *, BLASLONG,
                     xdouble *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    }
    return;
  }
#endif

  if (mode & BLAS_DOUBLE) {
    double *alpha = (double *)args -> alpha;

    if (is_complex) {
      /* COMPLEX / Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, double, double,
                     double *, BLASLONG, double *, BLASLONG,
                     double *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    } else {
      /* REAL / Double */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, double,
                     double *, BLASLONG, double *, BLASLONG,
                     double *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    }
    return;
  }

  {
    float *alpha = (float *)args -> alpha;

    if (is_complex) {
      /* COMPLEX / Single */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, float, float,
                     float *, BLASLONG, float *, BLASLONG,
                     float *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0], alpha[1],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    } else {
      /* REAL / Single */
      void (*kernel)(BLASLONG, BLASLONG, BLASLONG, float,
                     float *, BLASLONG, float *, BLASLONG,
                     float *, BLASLONG, void *) = func;

      kernel(args -> m, args -> n, args -> k,
             alpha[0],
             args -> a, args -> lda,
             args -> b, args -> ldb,
             args -> c, args -> ldc, sb);
    }
  }
}
 | 
						|
 | 
						|
#if defined(OS_LINUX) && !defined(NO_AFFINITY)
 | 
						|
int gotoblas_set_affinity(int);
 | 
						|
int gotoblas_set_affinity2(int);
 | 
						|
int get_node(void);
 | 
						|
#endif
 | 
						|
 | 
						|
static int increased_threads = 0;
 | 
						|
 | 
						|
static int blas_thread_server(void *arg){
 | 
						|
 | 
						|
  /* Thread identifier */
 | 
						|
  BLASLONG  cpu = (BLASLONG)arg;
 | 
						|
  unsigned int last_tick;
 | 
						|
  void *buffer, *sa, *sb;
 | 
						|
  blas_queue_t	*queue;
 | 
						|
#ifdef TIMING_DEBUG
 | 
						|
  unsigned long start, stop;
 | 
						|
#endif
 | 
						|
  
 | 
						|
#if defined(OS_LINUX) && !defined(NO_AFFINITY)
 | 
						|
  if (!increased_threads)
 | 
						|
    thread_status[cpu].node = gotoblas_set_affinity(cpu + 1);
 | 
						|
  else  
 | 
						|
    thread_status[cpu].node = gotoblas_set_affinity(-1);
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
  main_status[cpu] = MAIN_ENTER;
 | 
						|
#endif
 | 
						|
 | 
						|
  buffer = blas_memory_alloc(2);
 | 
						|
 | 
						|
#ifdef SMP_DEBUG
 | 
						|
  fprintf(STDERR, "Server[%2ld] Thread has just been spawned!\n", cpu);
 | 
						|
#endif
 | 
						|
  
 | 
						|
  while (1){
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
    main_status[cpu] = MAIN_QUEUING;
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef TIMING
 | 
						|
    exit_time[cpu] = rpcc();
 | 
						|
#endif
 | 
						|
 | 
						|
      last_tick = (unsigned int)rpcc();
 | 
						|
      
 | 
						|
      while (!thread_status[cpu].queue) {
 | 
						|
	
 | 
						|
	YIELDING;
 | 
						|
 | 
						|
	if ((unsigned int)rpcc() - last_tick > thread_timeout) {
 | 
						|
	  
 | 
						|
	  pthread_mutex_lock  (&thread_status[cpu].lock);
 | 
						|
	  
 | 
						|
	  if (!thread_status[cpu].queue) {
 | 
						|
	    thread_status[cpu].status = THREAD_STATUS_SLEEP;
 | 
						|
	    while (thread_status[cpu].status == THREAD_STATUS_SLEEP) {
 | 
						|
	      
 | 
						|
#ifdef MONITOR
 | 
						|
	      main_status[cpu] = MAIN_SLEEPING;
 | 
						|
#endif
 | 
						|
	      
 | 
						|
	      pthread_cond_wait(&thread_status[cpu].wakeup, &thread_status[cpu].lock);
 | 
						|
	    }
 | 
						|
	  }
 | 
						|
	  
 | 
						|
	  pthread_mutex_unlock(&thread_status[cpu].lock);
 | 
						|
	  
 | 
						|
	  last_tick = (unsigned int)rpcc();
 | 
						|
	}
 | 
						|
	
 | 
						|
      }
 | 
						|
      
 | 
						|
    queue = thread_status[cpu].queue;
 | 
						|
 | 
						|
    if ((long)queue == -1) break;
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
    main_status[cpu] = MAIN_RECEIVING;
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef TIMING_DEBUG
 | 
						|
    start = rpcc();
 | 
						|
#endif
 | 
						|
 | 
						|
    if (queue) {
 | 
						|
      int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
 | 
						|
	
 | 
						|
      thread_status[cpu].queue = (blas_queue_t *)1;
 | 
						|
 | 
						|
      sa = queue -> sa;
 | 
						|
      sb = queue -> sb;
 | 
						|
      
 | 
						|
#ifdef SMP_DEBUG
 | 
						|
      if (queue -> args) {
 | 
						|
	fprintf(STDERR, "Server[%2ld] Calculation started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 | 
						|
		cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
 | 
						|
      }
 | 
						|
#endif
 | 
						|
      
 | 
						|
#ifdef CONSISTENT_FPCSR
 | 
						|
      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
 | 
						|
      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
      main_status[cpu] = MAIN_RUNNING1;
 | 
						|
#endif
 | 
						|
      
 | 
						|
      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
 | 
						|
 | 
						|
      if (sb == NULL) {
 | 
						|
	if (!(queue -> mode & BLAS_COMPLEX)){
 | 
						|
#ifdef EXPRECISION
 | 
						|
	  if (queue -> mode & BLAS_XDOUBLE){
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((QGEMM_P * QGEMM_Q * sizeof(xdouble) 
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	  } else 
 | 
						|
#endif
 | 
						|
	  if (queue -> mode & BLAS_DOUBLE){
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((DGEMM_P * DGEMM_Q * sizeof(double)
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	    
 | 
						|
	  } else {
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((SGEMM_P * SGEMM_Q * sizeof(float)
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	  }
 | 
						|
	} else {
 | 
						|
#ifdef EXPRECISION
 | 
						|
	  if (queue -> mode & BLAS_XDOUBLE){
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((XGEMM_P * XGEMM_Q * 2 * sizeof(xdouble)
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	  } else
 | 
						|
#endif
 | 
						|
	  if (queue -> mode & BLAS_DOUBLE){
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((ZGEMM_P * ZGEMM_Q * 2 * sizeof(double)
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	  } else {
 | 
						|
	    sb = (void *)(((BLASLONG)sa + ((CGEMM_P * CGEMM_Q * 2 * sizeof(float)
 | 
						|
					+ GEMM_ALIGN) & ~GEMM_ALIGN)) + GEMM_OFFSET_B);
 | 
						|
	  }
 | 
						|
	}
 | 
						|
	queue->sb=sb;
 | 
						|
      }
 | 
						|
	
 | 
						|
#ifdef MONITOR
 | 
						|
	main_status[cpu] = MAIN_RUNNING2;
 | 
						|
#endif
 | 
						|
 | 
						|
      if (queue -> mode & BLAS_LEGACY) {
 | 
						|
	legacy_exec(routine, queue -> mode, queue -> args, sb);
 | 
						|
      } else
 | 
						|
	if (queue -> mode & BLAS_PTHREAD) {
 | 
						|
	  void (*pthreadcompat)(void *) = queue -> routine;
 | 
						|
	  (pthreadcompat)(queue -> args);
 | 
						|
	} else
 | 
						|
	  (routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
 | 
						|
 | 
						|
#ifdef SMP_DEBUG
 | 
						|
      fprintf(STDERR, "Server[%2ld] Calculation finished!\n", cpu);
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
      main_status[cpu] = MAIN_FINISH;
 | 
						|
#endif
 | 
						|
      
 | 
						|
      thread_status[cpu].queue = (blas_queue_t * volatile) ((long)thread_status[cpu].queue & 0);  /* Need a trick */
 | 
						|
      WMB;
 | 
						|
 | 
						|
    }
 | 
						|
    
 | 
						|
#ifdef MONITOR
 | 
						|
      main_status[cpu] = MAIN_DONE;
 | 
						|
#endif
 | 
						|
 | 
						|
#ifdef TIMING_DEBUG
 | 
						|
    stop = rpcc();
 | 
						|
    
 | 
						|
    fprintf(STDERR, "Thread[%ld] : %16lu %16lu (%8lu cycles)\n", cpu + 1,
 | 
						|
	    start, stop,
 | 
						|
	    stop - start);
 | 
						|
#endif
 | 
						|
    
 | 
						|
  }
 | 
						|
 | 
						|
  /* Shutdown procedure */
 | 
						|
 | 
						|
#ifdef SMP_DEBUG
 | 
						|
      fprintf(STDERR, "Server[%2ld] Shutdown!\n",  cpu);
 | 
						|
#endif
 | 
						|
 | 
						|
  blas_memory_free(buffer);
 | 
						|
 | 
						|
  //pthread_exit(NULL);
 | 
						|
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
 | 
						|
static BLASLONG num_suspend = 0;
 | 
						|
 | 
						|
static int blas_monitor(void *arg){
 | 
						|
  int i;
 | 
						|
 | 
						|
  while(1){
 | 
						|
    for (i = 0; i < blas_num_threads - 1; i++){
 | 
						|
      switch (main_status[i]) {
 | 
						|
      case MAIN_ENTER :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Entering.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_EXIT :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Exiting.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_TRYLOCK :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Trying lock operation.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_QUEUING :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Queuing.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_RECEIVING :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Receiving.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_RUNNING1 :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Running1.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_RUNNING2 :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Running2.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_RUNNING3 :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Running3.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_WAITING :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Waiting.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_SLEEPING :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Sleeping.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_FINISH :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Finishing.\n", i);
 | 
						|
	break;
 | 
						|
      case MAIN_DONE :
 | 
						|
	fprintf(STDERR, "THREAD[%2d] : Job is done.\n", i);
 | 
						|
	break;
 | 
						|
      }
 | 
						|
 | 
						|
      fprintf(stderr, "Total number of suspended ... %ld\n", num_suspend);
 | 
						|
    }
 | 
						|
    sleep(1);
 | 
						|
  }
 | 
						|
  
 | 
						|
 return 0;
 | 
						|
}
 | 
						|
#endif
 | 
						|
 | 
						|
/* Initializing routine */
 | 
						|
int blas_thread_init(void){
 | 
						|
  BLASLONG i;
 | 
						|
  int ret;
 | 
						|
#ifdef NEED_STACKATTR
 | 
						|
  pthread_attr_t attr;
 | 
						|
#endif
 | 
						|
 | 
						|
  if (blas_server_avail) return 0;
 | 
						|
  
 | 
						|
#ifdef NEED_STACKATTR
 | 
						|
  pthread_attr_init(&attr);
 | 
						|
  pthread_attr_setguardsize(&attr,  0x1000U);
 | 
						|
  pthread_attr_setstacksize( &attr, 0x1000U);
 | 
						|
#endif
 | 
						|
  
 | 
						|
  LOCK_COMMAND(&server_lock);
 | 
						|
 | 
						|
  if (!blas_server_avail){
 | 
						|
 | 
						|
    char *p;
 | 
						|
 | 
						|
    p = getenv("THREAD_TIMEOUT");
 | 
						|
 | 
						|
    if (p) {
 | 
						|
      thread_timeout = atoi(p);
 | 
						|
      if (thread_timeout <  4) thread_timeout =  4;
 | 
						|
      if (thread_timeout > 30) thread_timeout = 30;
 | 
						|
      thread_timeout = (1 << thread_timeout);
 | 
						|
    }else{
 | 
						|
		p = getenv("GOTO_THREAD_TIMEOUT");
 | 
						|
		if (p) {
 | 
						|
			thread_timeout = atoi(p);
 | 
						|
			if (thread_timeout <  4) thread_timeout =  4;
 | 
						|
			if (thread_timeout > 30) thread_timeout = 30;
 | 
						|
			thread_timeout = (1 << thread_timeout);
 | 
						|
		}
 | 
						|
	}
 | 
						|
	
 | 
						|
 | 
						|
    for(i = 0; i < blas_num_threads - 1; i++){
 | 
						|
 | 
						|
      thread_status[i].queue  = (blas_queue_t *)NULL;
 | 
						|
      thread_status[i].status = THREAD_STATUS_WAKEUP;
 | 
						|
      
 | 
						|
      pthread_mutex_init(&thread_status[i].lock, NULL);
 | 
						|
      pthread_cond_init (&thread_status[i].wakeup, NULL);
 | 
						|
      
 | 
						|
#ifdef NEED_STACKATTR
 | 
						|
      ret=pthread_create(&blas_threads[i], &attr, 
 | 
						|
		     (void *)&blas_thread_server, (void *)i);
 | 
						|
#else
 | 
						|
      ret=pthread_create(&blas_threads[i], NULL, 
 | 
						|
		     (void *)&blas_thread_server, (void *)i);
 | 
						|
#endif
 | 
						|
      if(ret!=0){
 | 
						|
	fprintf(STDERR,"OpenBLAS: pthread_creat error in blas_thread_init function. Error code:%d\n",ret);
 | 
						|
	exit(1);
 | 
						|
      }
 | 
						|
    }
 | 
						|
 | 
						|
#ifdef MONITOR
 | 
						|
    pthread_create(&monitor_thread, NULL, 
 | 
						|
		     (void *)&blas_monitor, (void *)NULL);
 | 
						|
#endif
 | 
						|
 | 
						|
    blas_server_avail = 1;
 | 
						|
  }
 | 
						|
 | 
						|
  UNLOCK_COMMAND(&server_lock);
 | 
						|
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
/* 
 | 
						|
   User can call one of two routines.
 | 
						|
 | 
						|
     exec_blas_async ... immediately returns after jobs are queued.
 | 
						|
 | 
						|
     exec_blas       ... returns after jobs are finished.
 | 
						|
*/
 | 
						|
 | 
						|
static BLASULONG exec_queue_lock = 0;
 | 
						|
 | 
						|
/*
 * Queue a linked list of jobs on the server threads and return without
 * waiting for them.
 *
 * pos   : position value stored in the first job; incremented for each
 *         following job in the list.
 * queue : head of the job list (linked through ->next).
 *
 * Phase 1 (under exec_queue_lock): each job is handed to a thread whose
 * job slot is empty, and the chosen index is recorded in job->assigned.
 * With BLAS_NODE set (Linux affinity builds) a thread whose recorded node
 * equals the current node is preferred, moving on to the next node when
 * none is free.
 * Phase 2: every assigned thread that is asleep is woken through its
 * condition variable.
 *
 * Returns 0.
 */
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){

#ifdef SMP_SERVER
  // Handle lazy re-init of the thread-pool after a POSIX fork
  if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
  BLASLONG i = 0;
  blas_queue_t *current = queue;
#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)
  int node  = get_node();
  int nodes = get_num_nodes();
#endif

#ifdef SMP_DEBUG
  int exec_count = 0;
  /* pos is a BLASLONG: print it through an explicit long. */
  fprintf(STDERR, "Exec_blas_async is called. Position = %ld\n", (long)pos);
#endif

  blas_lock(&exec_queue_lock);

  while (queue) {
    queue -> position  = pos;

#ifdef CONSISTENT_FPCSR
    __asm__ __volatile__ ("fnstcw %0"  : "=m" (queue -> x87_mode));
    __asm__ __volatile__ ("stmxcsr %0" : "=m" (queue -> sse_mode));
#endif

#if defined(OS_LINUX) && !defined(NO_AFFINITY) && !defined(PARAMTEST)

    /* Node Mapping Mode */

    if (queue -> mode & BLAS_NODE) {

      do {
        /* Check the bound first so a slot with no server thread is never read. */
        while ((i < blas_num_threads - 1) &&
               (thread_status[i].node != node || thread_status[i].queue)) i ++;

        if (i < blas_num_threads - 1) break;

        i ++;
        if (i >= blas_num_threads - 1) {
          i = 0;
          node ++;
          if (node >= nodes) node = 0;
        }

      } while (1);

    } else {
      while(thread_status[i].queue) {
        i ++;
        if (i >= blas_num_threads - 1) i = 0;
      }
    }
#else
    while(thread_status[i].queue) {
      i ++;
      if (i >= blas_num_threads - 1) i = 0;
    }
#endif

    /* Publish the assignment before the job becomes visible to the worker. */
    queue -> assigned = i;
    WMB;
    thread_status[i].queue = queue;
    WMB;

    queue = queue -> next;
    pos ++;
#ifdef SMP_DEBUG
    exec_count ++;
#endif

  }

  blas_unlock(&exec_queue_lock);

#ifdef SMP_DEBUG
  /* exec_count is an int. */
  fprintf(STDERR, "Done(Number of threads = %2d).\n", exec_count);
#endif

  while (current) {

    pos = current -> assigned;

    /* Slot values 0 and 1 mean the job is already finished or running. */
    if ((BLASULONG)thread_status[pos].queue > 1) {

      if (thread_status[pos].status == THREAD_STATUS_SLEEP) {

        pthread_mutex_lock  (&thread_status[pos].lock);

#ifdef MONITOR
        num_suspend ++;
#endif

        /* Re-check under the lock before signalling. */
        if (thread_status[pos].status == THREAD_STATUS_SLEEP) {
          thread_status[pos].status = THREAD_STATUS_WAKEUP;
          pthread_cond_signal(&thread_status[pos].wakeup);
        }
        pthread_mutex_unlock(&thread_status[pos].lock);
      }
    }

    current = current -> next;
  }

  return 0;
}
 | 
						|
 | 
						|
/*
 * Wait for queued jobs to finish.
 *
 * Walks at most num entries of the job list starting at queue and, for
 * each one, spins (yielding) until the job slot of the thread recorded in
 * ->assigned has been cleared.  Returns 0.
 */
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){

  blas_queue_t *job;
  BLASLONG remaining;

  for (job = queue, remaining = num;
       (remaining > 0) && job;
       job = job -> next, remaining --) {

    while (thread_status[job -> assigned].queue) {
      YIELDING;
    }
  }

#ifdef SMP_DEBUG
  fprintf(STDERR, "Done.\n\n");
#endif

  return 0;
}
 | 
						|
 | 
						|
/* Execute Threads */
 | 
						|
/*
 * Execute Threads.
 *
 * Runs a list of num jobs and returns when all of them are done: every job
 * after the first is queued on the server threads, the first job is run by
 * the calling thread with position 0, and the call then waits for the
 * queued ones.  Returns 0; does nothing when num <= 0 or queue is NULL.
 */
int exec_blas(BLASLONG num, blas_queue_t *queue){

#ifdef SMP_SERVER
  // Handle lazy re-init of the thread-pool after a POSIX fork
  if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
  int (*kernel)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);

#ifdef TIMING_DEBUG
  BLASULONG start, stop;
#endif

  if ((num <= 0) || (queue == NULL)) return 0;

#ifdef SMP_DEBUG
  fprintf(STDERR, "Exec_blas is called. Number of executing threads : %ld\n", num);
#endif

#ifdef __ELF__
  /* omp_in_parallel is tested as a symbol address before it is called. */
  if (omp_in_parallel && (num > 1) && (omp_in_parallel() > 0)) {
    fprintf(stderr,
            "OpenBLAS Warning : Detect OpenMP Loop and this application may hang. "
            "Please rebuild the library with USE_OPENMP=1 option.\n");
  }
#endif

  /* Hand everything but the first job to the server threads. */
  if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);

#ifdef TIMING_DEBUG
  start = rpcc();

  fprintf(STDERR, "\n");
#endif

  kernel = queue -> routine;

  /* The first job runs on the calling thread. */
  if (queue -> mode & BLAS_LEGACY) {
    legacy_exec(kernel, queue -> mode, queue -> args, queue -> sb);
  } else if (queue -> mode & BLAS_PTHREAD) {
    void (*pthreadcompat)(void *) = queue -> routine;
    (pthreadcompat)(queue -> args);
  } else {
    (kernel)(queue -> args, queue -> range_m, queue -> range_n,
             queue -> sa, queue -> sb, 0);
  }

#ifdef TIMING_DEBUG
  stop = rpcc();
#endif

  if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);

#ifdef TIMING_DEBUG
  fprintf(STDERR, "Thread[0] : %16lu %16lu (%8lu cycles)\n",
          start, stop,
          stop - start);
#endif

  return 0;
}
 | 
						|
 | 
						|
void goto_set_num_threads(int num_threads) {
 | 
						|
 | 
						|
  long i;
 | 
						|
 | 
						|
  if (num_threads < 1) num_threads = blas_num_threads;
 | 
						|
 | 
						|
#ifndef NO_AFFINITY
 | 
						|
  if (num_threads == 1) {
 | 
						|
    if (blas_cpu_number == 1){
 | 
						|
      //OpenBLAS is already single thread.
 | 
						|
      return; 
 | 
						|
    }else{
 | 
						|
      //From multi-threads to single thread
 | 
						|
      //Restore the original affinity mask
 | 
						|
      gotoblas_set_affinity(-1);
 | 
						|
    }
 | 
						|
  }
 | 
						|
#endif
 | 
						|
 | 
						|
  if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
 | 
						|
 | 
						|
  if (num_threads > blas_num_threads) {
 | 
						|
 | 
						|
    LOCK_COMMAND(&server_lock);
 | 
						|
    
 | 
						|
    increased_threads = 1;
 | 
						|
 | 
						|
    for(i = blas_num_threads - 1; i < num_threads - 1; i++){
 | 
						|
      
 | 
						|
      thread_status[i].queue  = (blas_queue_t *)NULL;
 | 
						|
      thread_status[i].status = THREAD_STATUS_WAKEUP;
 | 
						|
      
 | 
						|
      pthread_mutex_init(&thread_status[i].lock, NULL);
 | 
						|
      pthread_cond_init (&thread_status[i].wakeup, NULL);
 | 
						|
      
 | 
						|
#ifdef NEED_STACKATTR
 | 
						|
      pthread_create(&blas_threads[i], &attr, 
 | 
						|
		     (void *)&blas_thread_server, (void *)i);
 | 
						|
#else
 | 
						|
      pthread_create(&blas_threads[i], NULL, 
 | 
						|
		     (void *)&blas_thread_server, (void *)i);
 | 
						|
#endif
 | 
						|
    }
 | 
						|
    
 | 
						|
    blas_num_threads = num_threads;
 | 
						|
 | 
						|
    UNLOCK_COMMAND(&server_lock);
 | 
						|
  }
 | 
						|
 | 
						|
#ifndef NO_AFFINITY
 | 
						|
  if(blas_cpu_number == 1 && num_threads > 1){
 | 
						|
    //Restore the thread 0 affinity.
 | 
						|
    gotoblas_set_affinity(0);
 | 
						|
  }
 | 
						|
#endif
 | 
						|
 | 
						|
  blas_cpu_number  = num_threads;
 | 
						|
 | 
						|
#if defined(ARCH_MIPS64) 
 | 
						|
  //set parameters for different number of threads.
 | 
						|
  blas_set_parameter();
 | 
						|
#endif
 | 
						|
 | 
						|
}
 | 
						|
 | 
						|
/* OpenBLAS-named entry point; forwards the request unchanged to */
/* goto_set_num_threads().                                       */
void openblas_set_num_threads(int num_threads)
{
  goto_set_num_threads(num_threads);
}
 | 
						|
 | 
						|
/* Function compatible with pthread_create / pthread_join */
 | 
						|
 | 
						|
int gotoblas_pthread(int numthreads, void *function, void *args, int stride) {
 | 
						|
 | 
						|
  blas_queue_t queue[MAX_CPU_NUMBER];
 | 
						|
  int i;
 | 
						|
 | 
						|
  if (numthreads <= 0) return 0;
 | 
						|
 | 
						|
#ifdef SMP
 | 
						|
  if (blas_cpu_number == 0) blas_get_cpu_number();
 | 
						|
#ifdef SMP_SERVER
 | 
						|
  if (blas_server_avail == 0) blas_thread_init();
 | 
						|
#endif
 | 
						|
#endif
 | 
						|
 | 
						|
  for (i = 0; i < numthreads; i ++) {
 | 
						|
 | 
						|
    queue[i].mode    = BLAS_PTHREAD;
 | 
						|
    queue[i].routine = function;
 | 
						|
    queue[i].args    = args;
 | 
						|
    queue[i].range_m = NULL;
 | 
						|
    queue[i].range_n = NULL;
 | 
						|
    queue[i].sa	     = args;
 | 
						|
    queue[i].sb	     = args;
 | 
						|
    queue[i].next    = &queue[i + 1];
 | 
						|
 | 
						|
    args += stride;
 | 
						|
  }
 | 
						|
  
 | 
						|
  queue[numthreads - 1].next = NULL;
 | 
						|
    
 | 
						|
  exec_blas(numthreads, queue);
 | 
						|
  
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
/* Shutdown procedure. Users do not have to call this routine; the   */
/* kernel automatically kills the threads.                           */
 | 
						|
 | 
						|
int BLASFUNC(blas_thread_shutdown)(void){
 | 
						|
 | 
						|
  int i;
 | 
						|
 | 
						|
  if (!blas_server_avail) return 0;
 | 
						|
  
 | 
						|
  LOCK_COMMAND(&server_lock);
 | 
						|
 | 
						|
  for (i = 0; i < blas_num_threads - 1; i++) {
 | 
						|
 | 
						|
    blas_lock(&exec_queue_lock);
 | 
						|
  
 | 
						|
    thread_status[i].queue = (blas_queue_t *)-1;
 | 
						|
 | 
						|
    blas_unlock(&exec_queue_lock);
 | 
						|
  
 | 
						|
    pthread_mutex_lock  (&thread_status[i].lock);
 | 
						|
 | 
						|
    thread_status[i].status = THREAD_STATUS_WAKEUP;
 | 
						|
 | 
						|
    pthread_cond_signal (&thread_status[i].wakeup);
 | 
						|
 | 
						|
    pthread_mutex_unlock(&thread_status[i].lock);
 | 
						|
 | 
						|
  }
 | 
						|
 | 
						|
  for(i = 0; i < blas_num_threads - 1; i++){
 | 
						|
    pthread_join(blas_threads[i], NULL);
 | 
						|
  }
 | 
						|
 | 
						|
  for(i = 0; i < blas_num_threads - 1; i++){
 | 
						|
    pthread_mutex_destroy(&thread_status[i].lock);
 | 
						|
    pthread_cond_destroy (&thread_status[i].wakeup);
 | 
						|
  }      
 | 
						|
 | 
						|
#ifdef NEED_STACKATTR
 | 
						|
  pthread_attr_destory(&attr);
 | 
						|
#endif
 | 
						|
 | 
						|
  blas_server_avail = 0;
 | 
						|
  
 | 
						|
  UNLOCK_COMMAND(&server_lock);
 | 
						|
  
 | 
						|
  return 0;
 | 
						|
}
 | 
						|
 | 
						|
#endif
 | 
						|
 |