Merge branch 'develop' into betterPowerGEMVTail

This commit is contained in:
Chip Kerchner 2024-08-14 10:52:46 -05:00
commit 75472b830a
28 changed files with 1845 additions and 1745 deletions

View File

@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 27.dev)
set(OpenBLAS_PATCH_VERSION 28.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
endif()
endif()
if (APPLE AND BUILD_SHARED_LIBS)
set(CMAKE_MACOSX_RPATH ON)
endif()
# It seems that this hack is not required since macOS 11 Big Sur
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)

View File

@ -1,4 +1,127 @@
OpenBLAS ChangeLog
====================================================================
Version 0.3.28
8-Aug-2024
general:
- Reworked the unfinished implementation of HUGETLB from GotoBLAS
for allocating huge memory pages as buffers on suitable systems
- Changed the unfinished implementation of GEMM3M for the generic
target on all architectures to at least forward to regular GEMM
- Improved multithreaded GEMM performance for large non-skinny matrices
- Improved BLAS3 performance on larger multicore systems through improved
parallelism
- Improved performance of the initial memory allocation by reducing
locking overhead
- Improved performance of GBMV at small problem sizes by introducing
a size barrier for the switch to multithreading
- Added an implementation of the CBLAS_GEMM_BATCH extension
- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in
CMAKE builds (error introduced in 0.3.27)
- Fixed corner cases involving the handling of NAN and INFINITY
arguments in ?SCAL on all architectures
- Added support for cross-compiling to WASM with CMAKE (in addition
to the already present makefile support)
- Fixed NAN handling and potential accuracy issues in compilations with
Intel ICX by supplying a suitable fp-model option by default
- The contents of the github project wiki have been converted into
a new set of documentation included with the source code.
- It is now possible to register a callback function that replaces
the built-in support for multithreading with an external backend
like TBB (openblas_set_threads_callback_function)
- Fixed potential duplication of suffixes in shared library naming
- Improved C compiler detection by the build system to tolerate more
naming variants for gcc builds
- Fixed an unnecessary dependency of the utest on CBLAS
- Fixed spurious error reports from the BLAS extensions utest
- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
- Fixed a flaw in the makefile build that could lead to the pkgconfig
file containing an entry of UNKNOWN for the target cpu after installing
- Integrated fixes from the Reference-LAPACK project:
- Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
- Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
- Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
- Make the variable type used for hidden length arguments configurable (PR 1025)
- Fixed SYTRD workspace computation and various typos (PR 1030)
- Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
x86-64:
- reverted thread management under Windows to its state before 0.3.26
due to signs of race conditions in some circumstances now under study
- fixed accidental selection of the unoptimized generic SBGEMM kernel
in CMAKE builds for CooperLake and SapphireRapids targets
- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
- fixed an accuracy issue in ZSCAL introduced in 0.3.26
- fixed compilation with CMAKE and recent releases of LLVM
- added support for Intel Emerald Rapids and Meteor Lake cpus
- added autodetection support for the Zhaoxin KX-7000 cpu
- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
- fixed compilation for older targets with the Yocto SDK
- fixed compilation of the converter-generated C versions
of the LAPACK sources with gcc-14
- improved compiler options when building with CMAKE and LLVM for
AVX512-capable targets
- added support for supplying the L2 cache size via an environment
variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
(as in some VM configurations)
- improved the error message shown when thread creation fails on startup
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
arm:
- fixed building for baremetal targets with make
arm64:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- added optimized SGEMV and DGEMV kernels for A64FX
- added optimized SVE kernels for small-matrix GEMM
- added A64FX to the cpu list for DYNAMIC_ARCH
- fixed building with support for cpu affinity
- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
Apple M targets
- improved GEMM performance on Neoverse V1
- fixed compilation for NEOVERSEN2 with older compilers
- fixed potential miscompilation of the SVE SDOT and DDOT kernels
- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
- fixed a potential overflow when using very large user-defined BUFFERSIZE
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
power:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- significantly improved performance of SBGEMM on POWER10
- fixed compilation with OpenMP and the XLF compiler
- fixed building of the BLAS extension utests under AIX
- fixed building of parts of the LAPACK testsuite with XLF
- fixed CSWAP/ZSWAP on big-endian POWER10 targets
- fixed a performance regression in SAXPY on POWER10 with OpenXL
- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
- fixed building for POWER9 under FreeBSD
- fixed a potential overflow when using very large user-defined BUFFERSIZE
- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
riscv64:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- fixed building for RISCV64_GENERIC with OpenMP enabled
- added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
RVV 1.0 targets with vector length of 128 and 256)
- worked around the ZVL128B kernels for AXPBY mishandling the special
case of zero Y increment
loongarch64:
- improved GEMM performance on servers of the 3C5000 generation
- improved performance and stability of DGEMM
- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
- fixed CMAKE compilation with the INTERFACE64 option set
- fixed compilation with CMAKE
- worked around spurious errors flagged by the BLAS3 tests
- worked around a miscompilation of the POTRS utest by gcc 14.1
mips64:
- fixed ASUM and SUM kernels to accept negative step sizes in X
- fixed complex GEMV kernels for MSA
====================================================================
Version 0.3.27
4-Apr-2024

View File

@ -45,6 +45,10 @@ else
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
endif
ifdef LAPACK_STRLEN
LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install

View File

@ -178,7 +178,7 @@ endif
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@echo 'version='$(VERSION) >> "$(PKGFILE)"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
@cat openblas.pc.in >> "$(PKGFILE)"

View File

@ -3,7 +3,7 @@
#
# This library's version
VERSION = 0.3.27.dev
VERSION = 0.3.28.dev
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
# Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1
# The variable type assumed for the length of character arguments when passing
# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
# versions used "int"). Mismatches will not cause runtime failures but may result
# in build warnings or errors when building with link-time optimization (LTO)
# LAPACK_STRLEN=int
# Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of

View File

@ -277,6 +277,12 @@ endif
ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(ARCH), power)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT

View File

@ -58,6 +58,10 @@ if (DYNAMIC_ARCH)
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
endif ()
if (RISCV64)
set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B)
endif ()
if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif ()

View File

@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
endif ()
if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR POWER)
if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@ -622,6 +622,9 @@ set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")
#For LAPACK Fortran codes.
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
if (LAPACK_STRLEN)
set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
endif()
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")
#Disable -fopenmp for LAPACK Fortran codes on Windows.

View File

@ -111,8 +111,8 @@ typedef struct blas_queue {
struct blas_queue *next;
#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
// CRITICAL_SECTION lock;
// HANDLE finish;
CRITICAL_SECTION lock;
HANDLE finish;
volatile int finished;
#else
pthread_mutex_t lock;

View File

@ -52,6 +52,8 @@ if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic_arm64.c)
elseif (POWER)
list(APPEND COMMON_SOURCES dynamic_power.c)
elseif (RISCV64)
list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
else ()
list(APPEND COMMON_SOURCES dynamic.c)
endif ()

View File

@ -1,4 +1,3 @@
/*********************************************************************/
/* Copyright 2009, 2010 The University of Texas at Austin. */
/* All rights reserved. */
@ -49,41 +48,31 @@
#endif
#endif
#ifdef SMP_DEBUG
# define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
#else
# define MT_TRACE(...)
#endif
/* This is a thread implementation for Win32 lazy implementation */
/* Thread server common information */
typedef struct{
CRITICAL_SECTION lock;
HANDLE filled;
HANDLE killed;
static blas_queue_t *work_queue = NULL;
static HANDLE kickoff_event = NULL;
static CRITICAL_SECTION queue_lock;
blas_queue_t *queue; /* Parameter Pointer */
int shutdown; /* server shutdown flag */
} blas_pool_t;
/* We need this global for checking if initialization is finished. */
int blas_server_avail = 0;
int blas_omp_threads_local = 1;
static void * blas_thread_buffer[MAX_CPU_NUMBER];
/* Local Variables */
static BLASULONG server_lock = 0;
static blas_pool_t pool;
static HANDLE blas_threads [MAX_CPU_NUMBER];
static DWORD blas_threads_id[MAX_CPU_NUMBER];
static volatile int thread_target; // target num of live threads, volatile for cross-thread reads
//Prototypes
static void exec_threads(int , blas_queue_t *, int);
static void adjust_thread_buffers();
//
// Legacy code path
//
static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){
if (!(mode & BLAS_COMPLEX)){
@ -207,395 +196,70 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
}
}
//
// This is the main routine of the threads. Each thread waits until a job is queued.
//
/* This is the main routine of the threads. Each thread waits until */
/* a job is queued. */
static DWORD WINAPI blas_thread_server(void *arg){
/* Thread identifier */
#ifdef SMP_DEBUG
BLASLONG cpu = (BLASLONG)arg;
#endif
void *buffer, *sa, *sb;
blas_queue_t *queue;
DWORD action;
HANDLE handles[] = {pool.filled, pool.killed};
MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
/* Each server needs each buffer */
buffer = blas_memory_alloc(2);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
#endif
while (1){
/* Waiting for Queue */
MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
#endif
// event raised when work is added to the queue
WaitForSingleObject(kickoff_event, INFINITE);
do {
action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
} while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));
if (cpu > thread_target - 2) {
//MT_TRACE("thread [%d] exiting.\n", cpu);
break; // excess thread, so worker thread exits
}
if (action == WAIT_OBJECT_0 + 1) break;
MT_TRACE("Server[%2ld] Got it.\n", cpu);
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
#endif
EnterCriticalSection(&queue_lock);
EnterCriticalSection(&pool.lock);
queue = work_queue;
if (queue)
work_queue = work_queue->next;
queue = pool.queue;
if (queue) pool.queue = queue->next;
LeaveCriticalSection(&queue_lock);
LeaveCriticalSection(&pool.lock);
if (queue) {
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
exec_threads(cpu, queue, 0);
} else {
if (pool.queue) SetEvent(pool.filled);
continue; //if queue == NULL
}
MT_TRACE("Server[%2ld] Finished!\n", cpu);
queue->finished = 1;
}
/* Shutdown procedure */
MT_TRACE("Server[%2ld] Shutdown!\n", cpu);
return 0;
}
//
// Initializing routine
//
int blas_thread_init(void) {
BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
LOCK_COMMAND(&server_lock);
adjust_thread_buffers();
MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
if (!blas_server_avail) {
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
thread_target = blas_cpu_number;
InitializeCriticalSection(&queue_lock);
for(i = 0; i < blas_cpu_number - 1; i++) {
//MT_TRACE("thread_init: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_server_avail = 1;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
//
// User can call one of two routines.
// exec_blas_async ... immediately returns after jobs are queued.
// exec_blas ... returns after jobs are finished.
//
int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
#if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork
// on Cygwin or as delayed init when a static library is used
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
blas_queue_t *current;
current = queue;
while (current) {
current -> position = pos;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif
current->finished = 0;
current = current -> next;
pos ++;
}
EnterCriticalSection(&queue_lock);
if (!work_queue)
{
work_queue = queue;
}
else
{
blas_queue_t *queue_item = work_queue;
// find the end of the work queue
while (queue_item->next)
queue_item = queue_item->next;
// add new work to the end
queue_item->next = queue;
}
LeaveCriticalSection(&queue_lock);
SetEvent(kickoff_event);
return 0;
}
//
// Join. Wait for all queued tasks to complete
//
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
MT_TRACE("Synchronization Waiting.\n");
while (num) {
MT_TRACE("Waiting Queue ..\n");
while (!queue->finished)
YIELDING;
queue = queue->next;
num--;
}
MT_TRACE("Completely Done.\n\n");
// if work was added to the queue after this batch we can't sleep the worker threads
// by resetting the event
EnterCriticalSection(&queue_lock);
if (work_queue == NULL)
ResetEvent(kickoff_event);
LeaveCriticalSection(&queue_lock);
return 0;
}
//
// Execute Threads
//
int exec_blas(BLASLONG num, blas_queue_t *queue) {
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
#ifndef ALL_THREADED
int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
#endif
if ((num <= 0) || (queue == NULL)) return 0;
//Redirect to caller's callback routine
if (openblas_threads_callback_) {
int buf_index = 0, i = 0;
#ifndef USE_SIMPLE_THREADED_LEVEL3
for (i = 0; i < num; i ++)
queue[i].position = i;
#endif
openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);
return 0;
}
if ((num > 1) && queue -> next)
exec_blas_async(1, queue -> next);
routine = queue -> routine;
if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else {
if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args);
} else
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
}
if ((num > 1) && queue -> next)
exec_blas_async_wait(num - 1, queue -> next);
return 0;
}
//
// Shutdown procedure. The user does not have to call this routine;
// the kernel automatically kills the threads.
//
int BLASFUNC(blas_thread_shutdown)(void) {
int i;
if (!blas_server_avail) return 0;
LOCK_COMMAND(&server_lock);
//Free buffers allocated for threads
for(i=0; i<MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i]!=NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i]=NULL;
}
}
if (blas_server_avail) {
for (i = 0; i < blas_num_threads - 1; i++) {
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif
CloseHandle(blas_threads[i]);
}
blas_server_avail = 0;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
//
// Legacy function to set the number of threads
//
void goto_set_num_threads(int num_threads)
{
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (blas_server_avail && num_threads < blas_num_threads) {
LOCK_COMMAND(&server_lock);
thread_target = num_threads;
SetEvent(kickoff_event);
for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
WaitForSingleObject(blas_threads[i], INFINITE);
//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
CloseHandle(blas_threads[i]);
}
blas_num_threads = num_threads;
ResetEvent(kickoff_event);
UNLOCK_COMMAND(&server_lock);
}
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
thread_target = num_threads;
//increased_threads = 1;
if (!blas_server_avail) {
// create the kickoff Event
kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
InitializeCriticalSection(&queue_lock);
blas_server_avail = 1;
}
for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
}
blas_cpu_number = num_threads;
}
//
// Openblas function to set thread count
//
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}
static void adjust_thread_buffers() {
int i=0;
//adjust buffer for each thread
for(i=0; i < blas_cpu_number; i++){
if(blas_thread_buffer[i] == NULL){
blas_thread_buffer[i] = blas_memory_alloc(2);
}
}
for(; i < MAX_CPU_NUMBER; i++){
if(blas_thread_buffer[i] != NULL){
blas_memory_free(blas_thread_buffer[i]);
blas_thread_buffer[i] = NULL;
}
}
}
//Individual thread's work executor; helps in setting up the synchronization environment and calling the inner_threads routine
static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
void *buffer, *sa, *sb;
buffer = blas_thread_buffer[cpu];
sa = queue -> sa;
sb = queue -> sb;
int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
__asm__ __volatile__ ("fldcw %0" : : "m" (queue -> x87_mode));
#endif
MT_TRACE("Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Started. Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
#endif
// fprintf(stderr, "queue start[%ld]!!!\n", cpu);
@ -603,8 +267,7 @@ static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
main_status[cpu] = MAIN_RUNNING1;
#endif
if (sa == NULL)
sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
if (sb == NULL) {
if (!(queue -> mode & BLAS_COMPLEX)){
@ -656,9 +319,271 @@ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
#endif
if (!(queue -> mode & BLAS_LEGACY)) {
(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
} else {
legacy_exec(routine, queue -> mode, queue -> args, sb);
}
}else{
continue; //if queue == NULL
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
#endif
EnterCriticalSection(&queue->lock);
queue -> status = BLAS_STATUS_FINISHED;
LeaveCriticalSection(&queue->lock);
SetEvent(queue->finish);
}
/* Shutdown procedure */
#ifdef SMP_DEBUG
fprintf(STDERR, "Server[%2ld] Shutdown!\n", cpu);
#endif
blas_memory_free(buffer);
return 0;
}
/* Initializing routine */
int blas_thread_init(void){
BLASLONG i;
if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
LOCK_COMMAND(&server_lock);
#ifdef SMP_DEBUG
fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
blas_cpu_number);
#endif
if (!blas_server_avail){
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
pool.shutdown = 0;
pool.queue = NULL;
for(i = 0; i < blas_cpu_number - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_server_avail = 1;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
/*
User can call one of two routines.
exec_blas_async ... immediately returns after jobs are queued.
exec_blas ... returns after jobs are finished.
*/
int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
#if defined(SMP_SERVER)
// Handle lazy re-init of the thread-pool after a POSIX fork
// on Cygwin or as delayed init when a static library is used
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
blas_queue_t *current;
current = queue;
while (current) {
InitializeCriticalSection(&current -> lock);
current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
current -> position = pos;
#ifdef CONSISTENT_FPCSR
__asm__ __volatile__ ("fnstcw %0" : "=m" (current -> x87_mode));
__asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
#endif
current = current -> next;
pos ++;
}
EnterCriticalSection(&pool.lock);
if (pool.queue) {
current = pool.queue;
while (current -> next) current = current -> next;
current -> next = queue;
} else {
pool.queue = queue;
}
LeaveCriticalSection(&pool.lock);
SetEvent(pool.filled);
return 0;
}
int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
#ifdef SMP_DEBUG
fprintf(STDERR, "Synchronization Waiting.\n");
#endif
while (num){
#ifdef SMP_DEBUG
fprintf(STDERR, "Waiting Queue ..\n");
#endif
WaitForSingleObject(queue->finish, INFINITE);
CloseHandle(queue->finish);
DeleteCriticalSection(&queue -> lock);
queue = queue -> next;
num --;
}
#ifdef SMP_DEBUG
fprintf(STDERR, "Completely Done.\n\n");
#endif
return 0;
}
/* Execute Threads */
int exec_blas(BLASLONG num, blas_queue_t *queue){
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
#ifndef ALL_THREADED
int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
#endif
if ((num <= 0) || (queue == NULL)) return 0;
if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
routine = queue -> routine;
if (queue -> mode & BLAS_LEGACY) {
legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
} else
if (queue -> mode & BLAS_PTHREAD) {
void (*pthreadcompat)(void *) = queue -> routine;
(pthreadcompat)(queue -> args);
} else
(routine)(queue -> args, queue -> range_m, queue -> range_n,
queue -> sa, queue -> sb, 0);
if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
return 0;
}
/* Shutdown procedure. The user does not have to call this routine; */
/* the kernel automatically kills the threads. */
int BLASFUNC(blas_thread_shutdown)(void){
int i;
if (!blas_server_avail) return 0;
LOCK_COMMAND(&server_lock);
if (blas_server_avail){
SetEvent(pool.killed);
for(i = 0; i < blas_num_threads - 1; i++){
// Could also just use WaitForMultipleObjects
DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
#ifndef OS_WINDOWSSTORE
// TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
if (WAIT_OBJECT_0 != wait_thread_value) {
TerminateThread(blas_threads[i],0);
}
#endif
CloseHandle(blas_threads[i]);
}
CloseHandle(pool.filled);
CloseHandle(pool.killed);
blas_server_avail = 0;
}
UNLOCK_COMMAND(&server_lock);
return 0;
}
void goto_set_num_threads(int num_threads)
{
long i;
#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
// Handle lazy re-init of the thread-pool after a POSIX fork
if (unlikely(blas_server_avail == 0)) blas_thread_init();
#endif
if (num_threads < 1) num_threads = blas_cpu_number;
if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
if (num_threads > blas_num_threads) {
LOCK_COMMAND(&server_lock);
//increased_threads = 1;
if (!blas_server_avail){
InitializeCriticalSection(&pool.lock);
pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
pool.killed = CreateEvent(NULL, TRUE, FALSE, NULL);
pool.shutdown = 0;
pool.queue = NULL;
blas_server_avail = 1;
}
for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
blas_threads[i] = CreateThread(NULL, 0,
blas_thread_server, (void *)i,
0, &blas_threads_id[i]);
}
blas_num_threads = num_threads;
UNLOCK_COMMAND(&server_lock);
}
blas_cpu_number = num_threads;
}
void openblas_set_num_threads(int num)
{
goto_set_num_threads(num);
}

View File

@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER
alloc_devicedirver,
#endif
#ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB)
#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
alloc_shm,
#endif
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))

View File

@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
#endif
#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX)
#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
// Check if we can convert GEMM -> GEMV
if (args.k != 0) {
if (args.n == 1) {

View File

@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64)
USE_GEMM3M = 1
endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), GENERIC)
USE_GEMM3M = 0
endif
else
ifeq ($(CORE), GENERIC)
USE_GEMM3M = 0
endif
endif
ifeq ($(ARCH), arm)
USE_TRMM = 1

View File

@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
#if 1
#include "zgemmkernel_2x2.c"
#else
#include "common.h"
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc)
{
return 0;
}
#endif

View File

@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
}
}
return(0);
}

View File

@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
}
}
}
return(0);
}

File diff suppressed because it is too large Load Diff

View File

@ -163,7 +163,8 @@
*> \endverbatim
*>
* =====================================================================
SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK,
$ INFO )
*
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -193,7 +194,8 @@
COMPLEX EI
* ..
* .. External Subroutines ..
EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM,
EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB,
$ CTRMM,
$ XERBLA
* ..
* .. Intrinsic Functions ..
@ -230,7 +232,7 @@
IF( NH.LE.1 ) THEN
LWKOPT = 1
ELSE
NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI,
$ -1 ) )
LWKOPT = N*NB + TSIZE
END IF

View File

@ -139,7 +139,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complexHEcomputational
*> \ingroup hetrd
*
*> \par Further Details:
* =====================
@ -188,7 +188,8 @@
*> \endverbatim
*>
* =====================================================================
SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
*
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -225,7 +226,8 @@
* .. External Functions ..
LOGICAL LSAME
INTEGER ILAENV
EXTERNAL LSAME, ILAENV
REAL SROUNDUP_LWORK
EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK
* ..
* .. Executable Statements ..
*
@ -249,8 +251,8 @@
* Determine the block size.
*
NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB
WORK( 1 ) = LWKOPT
LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
END IF
*
IF( INFO.NE.0 ) THEN
@ -367,7 +369,7 @@
$ TAU( I ), IINFO )
END IF
*
WORK( 1 ) = LWKOPT
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
RETURN
*
* End of CHETRD

View File

@ -109,7 +109,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup doubleOTHERauxiliary
*> \ingroup lanv2
*
*> \par Further Details:
* =====================
@ -248,10 +248,14 @@
*
* Compute [ A B ] = [ CS SN ] [ AA BB ]
* [ C D ] [-SN CS ] [ CC DD ]
*
* Note: Some of the multiplications are wrapped in parentheses to
* prevent compilers from using FMA instructions. See
* https://github.com/Reference-LAPACK/lapack/issues/1031.
*
A = AA*CS + CC*SN
B = BB*CS + DD*SN
C = -AA*SN + CC*CS
B = ( BB*CS ) + ( DD*SN )
C = -( AA*SN ) + ( CC*CS )
D = -BB*SN + DD*CS
*
TEMP = HALF*( A+D )

View File

@ -18,7 +18,7 @@
*>
*> \verbatim
*>
*> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A
*> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A
*> using the compact WY representation of Q.
*> \endverbatim
*
@ -93,7 +93,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup doubleGEcomputational
*> \ingroup gelqt
*
*> \par Further Details:
* =====================

View File

@ -74,7 +74,7 @@
*> A is REAL array, dimension
*> (LDA,M) if SIDE = 'L',
*> (LDA,N) if SIDE = 'R'
*> Part of the data structure to represent Q as returned by DGELQ.
*> Part of the data structure to represent Q as returned by SGELQ.
*> \endverbatim
*>
*> \param[in] LDA

View File

@ -20,7 +20,7 @@
*>
*> \verbatim
*>
*> DGEMLQT overwrites the general real M-by-N matrix C with
*> SGEMLQT overwrites the general real M-by-N matrix C with
*>
*> SIDE = 'L' SIDE = 'R'
*> TRANS = 'N': Q C C Q
@ -145,7 +145,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup doubleGEcomputational
*> \ingroup gemlqt
*
* =====================================================================
SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT,

View File

@ -109,7 +109,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup realOTHERauxiliary
*> \ingroup lanv2
*
*> \par Further Details:
* =====================
@ -248,10 +248,14 @@
*
* Compute [ A B ] = [ CS SN ] [ AA BB ]
* [ C D ] [-SN CS ] [ CC DD ]
*
* Note: Some of the multiplications are wrapped in parentheses to
* prevent compilers from using FMA instructions. See
* https://github.com/Reference-LAPACK/lapack/issues/1031.
*
A = AA*CS + CC*SN
B = BB*CS + DD*SN
C = -AA*SN + CC*CS
B = ( BB*CS ) + ( DD*SN )
C = -( AA*SN ) + ( CC*CS )
D = -BB*SN + DD*CS
*
TEMP = HALF*( A+D )

View File

@ -188,7 +188,8 @@
*> \endverbatim
*>
* =====================================================================
SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
*
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -248,7 +249,7 @@
* Determine the block size.
*
NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB
LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
END IF
*
@ -316,7 +317,8 @@
* Update the unreduced submatrix A(1:i-1,1:i-1), using an
* update of the form: A := A - V*W**T - W*V**T
*
CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ),
CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1,
$ I ),
$ LDA, WORK, LDWORK, ONE, A, LDA )
*
* Copy superdiagonal elements back into A, and diagonal

View File

@ -139,7 +139,7 @@
*> \author Univ. of Colorado Denver
*> \author NAG Ltd.
*
*> \ingroup complex16HEcomputational
*> \ingroup hetrd
*
*> \par Further Details:
* =====================
@ -188,7 +188,8 @@
*> \endverbatim
*>
* =====================================================================
SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
*
* -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -249,7 +250,7 @@
* Determine the block size.
*
NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB
LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = LWKOPT
END IF
*

View File

@ -189,8 +189,11 @@ endif
endif
ifeq ($(SUPPORT_GEMM3M),1)
level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m
else
level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
endif
ifneq ($(CROSS), 1)
rm -f ?BLAT3.SUMM