Merge branch 'develop' into betterPowerGEMVTail

2024-08-14 10:52:46 -05:00 · 2024-08-14 10:52:46 -05:00 · 75472b830a
parent 1a7b8c650d cd3945b998
commit 75472b830a
28 changed files with 1845 additions and 1745 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -8,7 +8,7 @@ project(OpenBLAS C ASM)

 set(OpenBLAS_MAJOR_VERSION 0)
 set(OpenBLAS_MINOR_VERSION 3)
-set(OpenBLAS_PATCH_VERSION 27.dev)
+set(OpenBLAS_PATCH_VERSION 28.dev)

 set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")

@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS

 option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)

+set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
+
 option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)

 option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF

 option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)

-option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF)
+option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)

 option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)

@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
  endif()
 endif()

+if (APPLE AND BUILD_SHARED_LIBS)
+set(CMAKE_MACOSX_RPATH ON)
+endif()
+
 # Seems that this hack doesn't required since macOS 11 Big Sur
 if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
  set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)
--- a/Changelog.txt
+++ b/Changelog.txt
@ -1,4 +1,127 @@
 OpenBLAS ChangeLog
+====================================================================
+Version 0.3.28
+ 8-Aug-2024
+
+general:
+- Reworked the unfinished implementation of HUGETLB from GotoBLAS
+  for allocating huge memory pages as buffers on suitable systems
+- Changed the unfinished implementation of GEMM3M for the generic
+  target on all architectures to at least forward to regular GEMM
+- Improved multithreaded GEMM performance for large non-skinny matrices
+- Improved BLAS3 performance on larger multicore systems through improved
+  parallelism
+- Improved performance of the initial memory allocation by reducing
+  locking overhead
+- Improved performance of GBMV at small problem sizes by introducing
+  a size barrier for the switch to multithreading
+- Added an implementation of the CBLAS_GEMM_BATCH extension
+- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in 
+  CMAKE builds (error introduced in 0.3.27)
+- Fixed corner cases involving the handling of NAN and INFINITY
+  arguments in ?SCAL on all architectures
+- Added support for cross-compiling to WASM with CMAKE (in addition
+  to the already present makefile support)
+- Fixed NAN handling and potential accuracy issues in compilations with
+  Intel ICX by supplying a suitable fp-model option by default
+- The contents of the github project wiki have been converted into
+  a new set of documentation included with the source code.
+- It is now possible to register a callback function that replaces
+  the built-in support for multithreading with an external backend
+  like TBB (openblas_set_threads_callback_function)
+- Fixed potential duplication of suffixes in shared library naming
+- Improved C compiler detection by the build system to tolerate more
+  naming variants for gcc builds
+- Fixed an unnecessary dependency of the utest on CBLAS
+- Fixed spurious error reports from the BLAS extensions utest
+- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
+- Fixed a flaw in the makefile build that could lead to the pkgconfig
+  file containing an entry of UNKNOWN for the target cpu after installing
+- Integrated fixes from the Reference-LAPACK project:
+  - Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
+  - Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
+  - Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
+  - Make the variable type used for hidden length arguments configurable (PR 1025)  
+  - Fixed SYTRD workspace computation and various typos (PR 1030)
+  - Prevent compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
+
+x86-64:
+- reverted thread management under Windows to its state before 0.3.26
+  due to signs of race conditions in some circumstances now under study
+- fixed accidental selection of the unoptimized generic SBGEMM kernel
+  in CMAKE builds for CooperLake and SapphireRapids targets
+- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
+- fixed an accuracy issue in ZSCAL introduced in 0.3.26
+- fixed compilation with CMAKE and recent releases of LLVM
+- added support for Intel Emerald Rapids and Meteor Lake cpus
+- added autodetection support for the Zhaoxin KX-7000 cpu
+- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
+- fixed compilation for older targets with the Yocto SDK
+- fixed compilation of the converter-generated C versions
+  of the LAPACK sources with gcc-14
+- improved compiler options when building with CMAKE and LLVM for
+  AVX512-capable targets
+- added support for supplying the L2 cache size via an environment
+  variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
+  (as in some VM configurations)
+- improved the error message shown when thread creation fails on startup
+- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
+
+arm:
+- fixed building for baremetal targets with make
+
+arm64:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- added optimized SGEMV and DGEMV kernels for A64FX
+- added optimized SVE kernels for small-matrix GEMM
+- added A64FX to the cpu list for DYNAMIC_ARCH
+- fixed building with support for cpu affinity
+- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
+  Apple M targets
+- improved GEMM performance on Neoverse V1
+- fixed compilation for NEOVERSEN2 with older compilers
+- fixed potential miscompilation of the SVE SDOT and DDOT kernels
+- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
+- fixed a potential overflow when using very large user-defined BUFFERSIZE
+- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
+
+power:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- significantly improved performance of SBGEMM on POWER10
+- fixed compilation with OpenMP and the XLF compiler
+- fixed building of the BLAS extension utests under AIX
+- fixed building of parts of the LAPACK testsuite with XLF
+- fixed CSWAP/ZSWAP on big-endian POWER10 targets
+- fixed a performance regression in SAXPY on POWER10 with OpenXL
+- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
+- fixed building for POWER9 under FreeBSD
+- fixed a potential overflow when using very large user-defined BUFFERSIZE
+- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
+
+riscv64:
+- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
+  matrix to the corresponding GEMV kernel 
+- fixed building for RISCV64_GENERIC with OpenMP enabled
+- added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
+  RVV 1.0 targets with vector length of 128 and 256)
+- worked around the ZVL128B kernels for AXPBY mishandling the special
+  case of zero Y increment
+
+loongarch64:
+- improved GEMM performance on servers of the 3C5000 generation
+- improved performance and stability of DGEMM
+- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
+- fixed CMAKE compilation with the INTERFACE64 option set
+- fixed compilation with CMAKE
+- worked around spurious errors flagged by the BLAS3 tests
+- worked around a miscompilation of the POTRS utest by gcc 14.1
+
+mips64:
+- fixed ASUM and SUM kernels to accept negative step sizes in X
+- fixed complex GEMV kernels for MSA
+
 ====================================================================
 Version 0.3.27
 4-Apr-2024
--- a/4
+++ b/4
@ -45,6 +45,10 @@ else
 LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
 endif

+ifdef LAPACK_STRLEN
+LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
+endif
+
 SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test

 .PHONY : all libs netlib $(RELA) test ctest shared install
--- a/Makefile.install
+++ b/Makefile.install
@ -178,7 +178,7 @@ endif
 	@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
 	@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
 	@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
-	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
+	@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
 	@echo 'version='$(VERSION) >> "$(PKGFILE)"
 	@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
 	@cat openblas.pc.in >> "$(PKGFILE)"
--- a/Makefile.rule
+++ b/Makefile.rule
@ -3,7 +3,7 @@
 #

 # This library's version
-VERSION = 0.3.27.dev
+VERSION = 0.3.28.dev

 # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
 # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
 # Build LAPACK Deprecated functions since LAPACK 3.6.0
 BUILD_LAPACK_DEPRECATED = 1

+# The variable type assumed for the length of character arguments when passing
+# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
+# versions used "int"). Mismatches will not cause runtime failures but may result
+# in build warnings or errors when building with link-time optimization (LTO)
+# LAPACK_STRLEN=int
+
 # Build RecursiveLAPACK on top of LAPACK
 # BUILD_RELAPACK = 1
 # Have RecursiveLAPACK actually replace standard LAPACK routines instead of 
--- a/Makefile.system
+++ b/Makefile.system
@ -277,6 +277,12 @@ endif
 ifeq ($(ARCH), arm64)
 GEMM_GEMV_FORWARD = 1
 endif
+ifeq ($(ARCH), riscv)
+GEMM_GEMV_FORWARD = 1
+endif
+ifeq ($(ARCH), power)
+GEMM_GEMV_FORWARD = 1
+endif

 ifeq ($(SMALL_MATRIX_OPT), 1)
 CCOMMON_OPT += -DSMALL_MATRIX_OPT
--- a/cmake/arch.cmake
+++ b/cmake/arch.cmake
@ -58,6 +58,10 @@ if (DYNAMIC_ARCH)
 	  set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
  endif ()
 
+  if (RISCV64)
+	  set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B) 
+  endif ()
+
  if (X86)
    set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
  endif ()
--- a/cmake/system.cmake
+++ b/cmake/system.cmake
@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
 endif ()

 if (DYNAMIC_ARCH)
-  if (X86 OR X86_64 OR ARM64 OR POWER)
+  if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
    set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
    if (DYNAMIC_OLDER)
      set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@ -622,6 +622,9 @@ set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")

 #For LAPACK Fortran codes.
 set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
+if (LAPACK_STRLEN)
+	set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
+endif()
 set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")

 #Disable -fopenmp for LAPACK Fortran codes on Windows.
--- a/common_thread.h
+++ b/common_thread.h
@ -111,8 +111,8 @@ typedef struct blas_queue {
  struct blas_queue *next;

 #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
-  // CRITICAL_SECTION lock;
-  // HANDLE finish;
+   CRITICAL_SECTION lock;
+   HANDLE finish;
  volatile int finished;
 #else
  pthread_mutex_t	 lock;
--- a/driver/others/CMakeLists.txt
+++ b/driver/others/CMakeLists.txt
@ -52,6 +52,8 @@ if (DYNAMIC_ARCH)
    list(APPEND COMMON_SOURCES dynamic_arm64.c)
  elseif (POWER)
    list(APPEND COMMON_SOURCES dynamic_power.c)
+  elseif (RISCV64)
+    list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
  else ()  
    list(APPEND COMMON_SOURCES dynamic.c)
  endif ()  
--- a/driver/others/blas_server_win32.c
+++ b/driver/others/blas_server_win32.c
@ -1,4 +1,3 @@
-
 /*********************************************************************/
 /* Copyright 2009, 2010 The University of Texas at Austin.           */
 /* All rights reserved.                                              */
@ -49,41 +48,31 @@
 #endif
 #endif

-#ifdef SMP_DEBUG
-#   define MT_TRACE(...) fprintf(stderr, __VA_ARGS__)
-#else
-#   define MT_TRACE(...)
-#endif
-
 /* This is a thread implementation for Win32 lazy implementation */

 /* Thread server common information */
+typedef struct{
+  CRITICAL_SECTION lock;
+  HANDLE filled;
+  HANDLE killed;

-static blas_queue_t *work_queue = NULL;
-static HANDLE kickoff_event = NULL;
-static CRITICAL_SECTION queue_lock;
+  blas_queue_t	*queue;    /* Parameter Pointer */
+  int		shutdown;  /* server shutdown flag */
+
+} blas_pool_t;

 /* We need this global for checking if initialization is finished.   */
 int blas_server_avail = 0;
-
 int blas_omp_threads_local = 1;
-
-static void * blas_thread_buffer[MAX_CPU_NUMBER];
-
 /* Local Variables */
 static BLASULONG server_lock       = 0;

+static blas_pool_t   pool;
 static HANDLE	    blas_threads   [MAX_CPU_NUMBER];
 static DWORD	    blas_threads_id[MAX_CPU_NUMBER];
-static volatile int thread_target;	// target num of live threads, volatile for cross-thread reads

-//Prototypes
-static void exec_threads(int , blas_queue_t *, int);
-static void adjust_thread_buffers();

-//
-// Legacy code path
-//
+
 static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb){

      if (!(mode & BLAS_COMPLEX)){
@ -207,395 +196,70 @@ static void legacy_exec(void *func, int mode, blas_arg_t *args, void *sb) {
      }
 }

-//
-// This is a main routine of threads. Each thread waits until job is queued.
-//
+/* This is a main routine of threads. Each thread waits until job is */
+/* queued.                                                           */
+
 static DWORD WINAPI blas_thread_server(void *arg){

  /* Thread identifier */
+#ifdef SMP_DEBUG
  BLASLONG  cpu = (BLASLONG)arg;
+#endif

+  void *buffer, *sa, *sb;
  blas_queue_t	*queue;
+  DWORD action;
+  HANDLE handles[] = {pool.filled, pool.killed};

-  MT_TRACE("Server[%2ld] Thread is started!\n", cpu);
+  /* Each server thread needs its own buffer */
+  buffer   = blas_memory_alloc(2);
+
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "Server[%2ld] Thread is started!\n", cpu);
+#endif

  while (1){

    /* Waiting for Queue */

-    MT_TRACE("Server[%2ld] Waiting for Queue.\n", cpu);
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Server[%2ld] Waiting for Queue.\n", cpu);
+#endif

-    // event raised when work is added to the queue
-    WaitForSingleObject(kickoff_event, INFINITE);
+    do {
+      action = WaitForMultipleObjects(2, handles, FALSE, INFINITE);
+    } while ((action != WAIT_OBJECT_0) && (action != WAIT_OBJECT_0 + 1));

-    if (cpu > thread_target - 2) {
-      //MT_TRACE("thread [%d] exiting.\n", cpu);
-      break;	// excess thread, so worker thread exits
-    }
+    if (action == WAIT_OBJECT_0 + 1) break;

-    MT_TRACE("Server[%2ld] Got it.\n", cpu);
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Server[%2ld] Got it.\n", cpu);
+#endif

-    EnterCriticalSection(&queue_lock);
+    EnterCriticalSection(&pool.lock);

-    queue = work_queue;
-    if (queue)
-        work_queue = work_queue->next;
+    queue = pool.queue;
+    if (queue) pool.queue = queue->next;

-    LeaveCriticalSection(&queue_lock);
+    LeaveCriticalSection(&pool.lock);

    if (queue)  {
+      int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;

-    exec_threads(cpu, queue, 0);
-    } else {
+      if (pool.queue) SetEvent(pool.filled);

-        continue; //if queue == NULL
-    }
-    
-    MT_TRACE("Server[%2ld] Finished!\n", cpu);
-	
-	  queue->finished = 1;
-  }
-
-  /* Shutdown procedure */
-
-  MT_TRACE("Server[%2ld] Shutdown!\n",  cpu);
-
-  return 0;
-}
-
-//
-// Initializing routine
-//
-int blas_thread_init(void) {
-  BLASLONG i;
-
-  if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
-
-  LOCK_COMMAND(&server_lock);
-
-  adjust_thread_buffers();
-
-  MT_TRACE("Initializing Thread(Num. threads = %d)\n", blas_cpu_number);
-
-  if (!blas_server_avail) {
-    // create the kickoff Event
-    kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
-
-    thread_target = blas_cpu_number;
-
-    InitializeCriticalSection(&queue_lock);
-
-    for(i = 0; i < blas_cpu_number - 1; i++) {
-	    //MT_TRACE("thread_init: creating thread [%d]\n", i);
-
-      blas_threads[i] = CreateThread(NULL, 0,
-				     blas_thread_server, (void *)i,
-				     0, &blas_threads_id[i]);
-    }
-
-    blas_server_avail = 1;
-  }
-
-  UNLOCK_COMMAND(&server_lock);
-
-  return 0;
-}
-
-//
-//   User can call one of two routines.
-//     exec_blas_async ... immediately returns after jobs are queued.
-//     exec_blas       ... returns after jobs are finished.
-//
-int exec_blas_async(BLASLONG pos, blas_queue_t *queue) {
-
-#if defined(SMP_SERVER)
-  // Handle lazy re-init of the thread-pool after a POSIX fork
-  // on Cygwin or as delayed init when a static library	is used
-  if (unlikely(blas_server_avail == 0)) blas_thread_init();
-#endif
-
-  blas_queue_t *current;
-
-  current = queue;
-
-  while (current) {
-    current -> position = pos;
-
-#ifdef CONSISTENT_FPCSR
-    __asm__ __volatile__ ("fnstcw %0"  : "=m" (current -> x87_mode));
-    __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
-#endif
-
-  	current->finished = 0;
-    current = current -> next;
-    pos ++;
-  }
-
-  EnterCriticalSection(&queue_lock);
-
-  if (!work_queue)
-  {
-    work_queue = queue;
-  }
-  else
-  {
-	  blas_queue_t *queue_item = work_queue;
-
-    // find the end of the work queue
-    while (queue_item->next)
-        queue_item = queue_item->next;
-
-    // add new work to the end
-    queue_item->next = queue;
-  }
-
-  LeaveCriticalSection(&queue_lock);
-
-  SetEvent(kickoff_event);
-
-  return 0;
-}
-
-//
-// Join. Wait for all queued tasks to complete
-//
-int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue) {
-
-  MT_TRACE("Synchronization Waiting.\n");
-
-  while (num) {
-    MT_TRACE("Waiting Queue ..\n");
-
-    while (!queue->finished)
-      YIELDING;
-
-    queue = queue->next;
-    num--;
-  }
-
-  MT_TRACE("Completely Done.\n\n");
-
-	// if work was added to the queue after this batch we can't sleep the worker threads
-	// by resetting the event
-	EnterCriticalSection(&queue_lock);
-
-	if (work_queue == NULL)
-		ResetEvent(kickoff_event);
-
-	LeaveCriticalSection(&queue_lock);
-
-	return 0;
-}
-
-//
-// Execute Threads
-//
-int exec_blas(BLASLONG num, blas_queue_t *queue) {
-
-#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
-  // Handle lazy re-init of the thread-pool after a POSIX fork
-  if (unlikely(blas_server_avail == 0)) blas_thread_init();
-#endif
-
-#ifndef ALL_THREADED
-   int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
-#endif
-
-  if ((num <= 0) || (queue == NULL)) return 0;
-
-  //Redirect to caller's callback routine
-  if (openblas_threads_callback_) {
-  int buf_index = 0, i = 0;
-#ifndef USE_SIMPLE_THREADED_LEVEL3
-    for (i = 0; i < num; i ++)
-      queue[i].position = i;
-#endif
-    openblas_threads_callback_(1, (openblas_dojob_callback) exec_threads, num, sizeof(blas_queue_t), (void*) queue, buf_index);
-    return 0;
-  }
-
-  if ((num > 1) && queue -> next) 
-    exec_blas_async(1, queue -> next);
-
-  routine = queue -> routine;
-
-  if (queue -> mode & BLAS_LEGACY) {
-    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
-  } else {
-    if (queue -> mode & BLAS_PTHREAD) {
-      void (*pthreadcompat)(void *) = queue -> routine;
-      (pthreadcompat)(queue -> args);
-    } else
-      (routine)(queue -> args, queue -> range_m, queue -> range_n,
-    		queue -> sa, queue -> sb, 0);
-  }
-
-  if ((num > 1) && queue -> next) 
-    exec_blas_async_wait(num - 1, queue -> next);
-
-  return 0;
-}
-
-//
-// Shutdown procedure, but user don't have to call this routine. The
-// kernel automatically kill threads.
-//
-int BLASFUNC(blas_thread_shutdown)(void) {
-
-  int i;
-
-  if (!blas_server_avail) return 0;
-
-  LOCK_COMMAND(&server_lock);
-
-  //Free buffers allocated for threads
-  for(i=0; i<MAX_CPU_NUMBER; i++){
-    if(blas_thread_buffer[i]!=NULL){
-      blas_memory_free(blas_thread_buffer[i]);
-      blas_thread_buffer[i]=NULL;
-    }
-  }
-
-  if (blas_server_avail) {
-
-    for (i = 0; i < blas_num_threads - 1; i++) {
-      // Could also just use WaitForMultipleObjects
-      DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
-
-#ifndef OS_WINDOWSSTORE
-      // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
-      if (WAIT_OBJECT_0 != wait_thread_value) {
-        TerminateThread(blas_threads[i],0);
-      }
-#endif
-
-      CloseHandle(blas_threads[i]);
-    }
-
-    blas_server_avail = 0;
-  }
-
-  UNLOCK_COMMAND(&server_lock);
-
-  return 0;
-}
-
-//
-// Legacy function to set numbef of threads
-//
-void goto_set_num_threads(int num_threads)
-{
-	long i;
-
-#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
-	// Handle lazy re-init of the thread-pool after a POSIX fork
-	if (unlikely(blas_server_avail == 0)) blas_thread_init();
-#endif
-
-	if (num_threads < 1) num_threads = blas_cpu_number;
-
-	if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
-
-	if (blas_server_avail && num_threads < blas_num_threads) {
-		LOCK_COMMAND(&server_lock);
-
-		thread_target = num_threads;
-		
-		SetEvent(kickoff_event);
-
-		for (i = num_threads - 1; i < blas_num_threads - 1; i++) {
-			//MT_TRACE("set_num_threads: waiting on thread [%d] to quit.\n", i);
-
-			WaitForSingleObject(blas_threads[i], INFINITE);
-
-			//MT_TRACE("set_num_threads: thread [%d] has quit.\n", i);
-
-			CloseHandle(blas_threads[i]);
-		}
-
-		blas_num_threads = num_threads;
-		
-		ResetEvent(kickoff_event);
-
-		UNLOCK_COMMAND(&server_lock);
-	}
-
-	if (num_threads > blas_num_threads) {
-
-		LOCK_COMMAND(&server_lock);
-
-		thread_target = num_threads;
-
-		  //increased_threads = 1;
-	    if (!blas_server_avail) {
-			// create the kickoff Event
-			kickoff_event = CreateEvent(NULL, TRUE, FALSE, NULL);
-
-			InitializeCriticalSection(&queue_lock);
-
-			blas_server_avail = 1;
-		}
-
-		for (i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++) {
-			//MT_TRACE("set_num_threads: creating thread [%d]\n", i);
-
-			blas_threads[i] = CreateThread(NULL, 0,
-				     blas_thread_server, (void *)i,
-				     0, &blas_threads_id[i]);
-		}
-
-		blas_num_threads = num_threads;
-
-		UNLOCK_COMMAND(&server_lock);
-	}
-
-	blas_cpu_number  = num_threads;
-}
-
-//
-// Openblas function to set thread count
-//
-void openblas_set_num_threads(int num)
-{
-	goto_set_num_threads(num);
-}
-
-static void adjust_thread_buffers() {
-
-  int i=0;
-
-  //adjust buffer for each thread
-  for(i=0; i < blas_cpu_number; i++){
-    if(blas_thread_buffer[i] == NULL){
-      blas_thread_buffer[i] = blas_memory_alloc(2);
-    }
-  }
-  for(; i < MAX_CPU_NUMBER; i++){
-    if(blas_thread_buffer[i] != NULL){
-      blas_memory_free(blas_thread_buffer[i]);
-      blas_thread_buffer[i] = NULL;
-    }
-  }
-}
-
-//Indivitual threads work executor, Helps in setting by synchronization environment and calling inner_threads routine
-static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
-  
-  void *buffer, *sa, *sb;
-  
-  buffer = blas_thread_buffer[cpu];
      sa = queue -> sa;
      sb = queue -> sb;

-  int (*routine)(blas_arg_t *, void *, void *, void *, void *, BLASLONG) = queue -> routine;
-
 #ifdef CONSISTENT_FPCSR
      __asm__ __volatile__ ("ldmxcsr %0" : : "m" (queue -> sse_mode));
      __asm__ __volatile__ ("fldcw %0"   : : "m" (queue -> x87_mode));
 #endif

-  MT_TRACE("Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
+#ifdef SMP_DEBUG
+      fprintf(STDERR, "Server[%2ld] Started.  Mode = 0x%03x M = %3ld N=%3ld K=%3ld\n",
 	      cpu, queue->mode, queue-> args ->m, queue->args->n, queue->args->k);
+#endif

      // fprintf(stderr, "queue start[%ld]!!!\n", cpu);

@ -603,8 +267,7 @@ static void exec_threads(int cpu, blas_queue_t *queue, int buf_index) {
      main_status[cpu] = MAIN_RUNNING1;
 #endif

-  if (sa == NULL) 
-    sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);
+      if (sa == NULL) sa = (void *)((BLASLONG)buffer + GEMM_OFFSET_A);

      if (sb == NULL) {
 	if (!(queue -> mode & BLAS_COMPLEX)){
@ -656,9 +319,271 @@ if ((queue -> mode & BLAS_PREC) == BLAS_XDOUBLE){
 #endif

      if (!(queue -> mode & BLAS_LEGACY)) {
+
 	(routine)(queue -> args, queue -> range_m, queue -> range_n, sa, sb, queue -> position);
      } else {
 	legacy_exec(routine, queue -> mode, queue -> args, sb);
      }
-
+    }else{
+		continue; //if queue == NULL
+	}
+
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Server[%2ld] Finished!\n", cpu);
+#endif
+
+    EnterCriticalSection(&queue->lock);
+
+    queue -> status = BLAS_STATUS_FINISHED;
+
+    LeaveCriticalSection(&queue->lock);
+
+    SetEvent(queue->finish);
+  }
+
+  /* Shutdown procedure */
+
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "Server[%2ld] Shutdown!\n",  cpu);
+#endif
+
+  blas_memory_free(buffer);
+
+  return 0;
+  }
+
+/* Initializing routine */
+int blas_thread_init(void){
+  BLASLONG i;
+
+  if (blas_server_avail || (blas_cpu_number <= 1)) return 0;
+
+  LOCK_COMMAND(&server_lock);
+
+#ifdef SMP_DEBUG
+  fprintf(STDERR, "Initializing Thread(Num. threads = %d)\n",
+	  blas_cpu_number);
+#endif
+
+  if (!blas_server_avail){
+
+    InitializeCriticalSection(&pool.lock);
+    pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
+    pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+
+    pool.shutdown = 0;
+    pool.queue    = NULL;
+
+    for(i = 0; i < blas_cpu_number - 1; i++){
+      blas_threads[i] = CreateThread(NULL, 0,
+				     blas_thread_server, (void *)i,
+				     0, &blas_threads_id[i]);
+    }
+
+    blas_server_avail = 1;
+  }
+
+  UNLOCK_COMMAND(&server_lock);
+
+  return 0;
+}
+
+/*
+   User can call one of two routines.
+
+     exec_blas_async ... immediately returns after jobs are queued.
+
+     exec_blas       ... returns after jobs are finished.
+*/
+
+int exec_blas_async(BLASLONG pos, blas_queue_t *queue){
+
+#if defined(SMP_SERVER)
+  // Handle lazy re-init of the thread-pool after a POSIX fork
+  // on Cygwin or as delayed init when a static library	is used
+  if (unlikely(blas_server_avail == 0)) blas_thread_init();
+#endif
+
+  blas_queue_t *current;
+
+  current = queue;
+
+  while (current) {
+    InitializeCriticalSection(&current -> lock);
+    current -> finish = CreateEvent(NULL, FALSE, FALSE, NULL);
+    current -> position = pos;
+
+#ifdef CONSISTENT_FPCSR
+    __asm__ __volatile__ ("fnstcw %0"  : "=m" (current -> x87_mode));
+    __asm__ __volatile__ ("stmxcsr %0" : "=m" (current -> sse_mode));
+#endif
+
+    current = current -> next;
+    pos ++;
+  }
+
+  EnterCriticalSection(&pool.lock);
+
+  if (pool.queue) {
+    current = pool.queue;
+    while (current -> next) current = current -> next;
+    current -> next = queue;
+  } else {
+    pool.queue = queue;
+  }
+
+  LeaveCriticalSection(&pool.lock);
+
+  SetEvent(pool.filled);
+
+  return 0;
+}
+
+int exec_blas_async_wait(BLASLONG num, blas_queue_t *queue){
+
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Synchronization Waiting.\n");
+#endif
+
+    while (num){
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Waiting Queue ..\n");
+#endif
+
+      WaitForSingleObject(queue->finish, INFINITE);
+
+      CloseHandle(queue->finish);
+      DeleteCriticalSection(&queue -> lock);
+
+      queue = queue -> next;
+      num --;
+    }
+
+#ifdef SMP_DEBUG
+    fprintf(STDERR, "Completely Done.\n\n");
+#endif
+
+  return 0;
+}
+
+/* Execute Threads */
+int exec_blas(BLASLONG num, blas_queue_t *queue){
+
+#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
+  // Handle lazy re-init of the thread-pool after a POSIX fork
+  if (unlikely(blas_server_avail == 0)) blas_thread_init();
+#endif
+
+#ifndef ALL_THREADED
+   int (*routine)(blas_arg_t *, void *, void *, double *, double *, BLASLONG);
+#endif
+
+  if ((num <= 0) || (queue == NULL)) return 0;
+
+  if ((num > 1) && queue -> next) exec_blas_async(1, queue -> next);
+
+  routine = queue -> routine;
+
+  if (queue -> mode & BLAS_LEGACY) {
+    legacy_exec(routine, queue -> mode, queue -> args, queue -> sb);
+  } else
+    if (queue -> mode & BLAS_PTHREAD) {
+      void (*pthreadcompat)(void *) = queue -> routine;
+      (pthreadcompat)(queue -> args);
+    } else
+      (routine)(queue -> args, queue -> range_m, queue -> range_n,
+		queue -> sa, queue -> sb, 0);
+
+  if ((num > 1) && queue -> next) exec_blas_async_wait(num - 1, queue -> next);
+
+  return 0;
+}
+
+/* Shutdown procedure; users don't have to call this routine. The   */
+/* kernel automatically kills threads.                               */
+
+int BLASFUNC(blas_thread_shutdown)(void){
+
+  int i;
+
+  if (!blas_server_avail) return 0;
+
+  LOCK_COMMAND(&server_lock);
+
+  if (blas_server_avail){
+
+    SetEvent(pool.killed);
+
+    for(i = 0; i < blas_num_threads - 1; i++){
+      // Could also just use WaitForMultipleObjects
+      DWORD wait_thread_value = WaitForSingleObject(blas_threads[i], 50);
+
+#ifndef OS_WINDOWSSTORE
+      // TerminateThread is only available with WINAPI_DESKTOP and WINAPI_SYSTEM not WINAPI_APP in UWP
+      if (WAIT_OBJECT_0 != wait_thread_value) {
+        TerminateThread(blas_threads[i],0);
+      }
+#endif
+
+      CloseHandle(blas_threads[i]);
+    }
+
+    CloseHandle(pool.filled);
+    CloseHandle(pool.killed);
+
+    blas_server_avail = 0;
+  }
+
+  UNLOCK_COMMAND(&server_lock);
+
+  return 0;
+}
+
+void goto_set_num_threads(int num_threads)
+{
+	long i;
+
+#if defined(SMP_SERVER) && defined(OS_CYGWIN_NT)
+	// Handle lazy re-init of the thread-pool after a POSIX fork
+	if (unlikely(blas_server_avail == 0)) blas_thread_init();
+#endif
+
+	if (num_threads < 1) num_threads = blas_cpu_number;
+
+	if (num_threads > MAX_CPU_NUMBER) num_threads = MAX_CPU_NUMBER;
+
+	if (num_threads > blas_num_threads) {
+
+		LOCK_COMMAND(&server_lock);
+
+		//increased_threads = 1;
+	    if (!blas_server_avail){
+
+			InitializeCriticalSection(&pool.lock);
+			pool.filled = CreateEvent(NULL, FALSE, FALSE, NULL);
+			pool.killed = CreateEvent(NULL, TRUE,  FALSE, NULL);
+
+			pool.shutdown = 0;
+			pool.queue    = NULL;
+			blas_server_avail = 1;
+		}
+
+		for(i = (blas_num_threads > 0) ? blas_num_threads - 1 : 0; i < num_threads - 1; i++){
+
+			blas_threads[i] = CreateThread(NULL, 0,
+				     blas_thread_server, (void *)i,
+				     0, &blas_threads_id[i]);
+		}
+
+		blas_num_threads = num_threads;
+
+		UNLOCK_COMMAND(&server_lock);
+	}
+
+	blas_cpu_number  = num_threads;
+}
+
+void openblas_set_num_threads(int num)
+{
+	goto_set_num_threads(num);
 }
--- a/driver/others/memory.c
+++ b/driver/others/memory.c
@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){
 #ifdef ALLOC_DEVICEDRIVER
    alloc_devicedirver,
 #endif
-#ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB)
+#if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
    alloc_shm,
 #endif
 #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX  || defined OS_AIX  || defined __sun__  || defined OS_WINDOWS))
--- a/interface/gemm.c
+++ b/interface/gemm.c
@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
 	 args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
 #endif

-#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX)
+#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
  // Check if we can convert GEMM -> GEMV
  if (args.k != 0) {
    if (args.n == 1) {
--- a/kernel/Makefile.L3
+++ b/kernel/Makefile.L3
@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64)
 USE_GEMM3M = 1
 endif

-ifneq ($(DYNAMIC_ARCH), 1)
-ifeq ($(TARGET), GENERIC)
-USE_GEMM3M = 0
-endif
-else
-ifeq ($(CORE), GENERIC)
-USE_GEMM3M = 0
-endif
-endif

 ifeq ($(ARCH), arm)
 USE_TRMM = 1
--- a/kernel/generic/zgemm3mkernel_dump.c
+++ b/kernel/generic/zgemm3mkernel_dump.c
@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
 USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *****************************************************************************/
+#if 1

+#include "zgemmkernel_2x2.c"
+
+
+#else
 #include "common.h"

 int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc)
 {
  return 0;
 }
+#endif
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            }
        }
    }
+    return(0);
 }
--- a/kernel/x86_64/sscal.c
+++ b/kernel/x86_64/sscal.c
@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
            }
        }
    }
+    return(0);
 }
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
--- a/lapack-netlib/SRC/cgehrd.f
+++ b/lapack-netlib/SRC/cgehrd.f
@ -163,7 +163,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO )
+      SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK,
+     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@ -193,7 +194,8 @@
      COMPLEX            EI
 *     ..
 *     .. External Subroutines ..
-      EXTERNAL           CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM,
+      EXTERNAL           CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB,
+     $                   CTRMM,
     $                   XERBLA
 *     ..
 *     .. Intrinsic Functions ..
@ -230,7 +232,7 @@
         IF( NH.LE.1 ) THEN
            LWKOPT = 1
         ELSE
-            NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI,
+            NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI,
     $                              -1 ) )
            LWKOPT = N*NB + TSIZE
         END IF
--- a/lapack-netlib/SRC/chetrd.f
+++ b/lapack-netlib/SRC/chetrd.f
@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complexHEcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
+      SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
+     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@ -225,7 +226,8 @@
 *     .. External Functions ..
      LOGICAL            LSAME
      INTEGER            ILAENV
-      EXTERNAL           LSAME, ILAENV
+      REAL               SROUNDUP_LWORK
+      EXTERNAL           LSAME, ILAENV, SROUNDUP_LWORK
 *     ..
 *     .. Executable Statements ..
 *
@ -249,8 +251,8 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
-         WORK( 1 ) = LWKOPT
+         LWKOPT = MAX( 1, N*NB )
+         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      END IF
 *
      IF( INFO.NE.0 ) THEN
@ -367,7 +369,7 @@
     $                TAU( I ), IINFO )
      END IF
 *
-      WORK( 1 ) = LWKOPT
+      WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      RETURN
 *
 *     End of CHETRD
--- a/lapack-netlib/SRC/dlanv2.f
+++ b/lapack-netlib/SRC/dlanv2.f
@ -109,7 +109,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleOTHERauxiliary
+*> \ingroup lanv2
 *
 *> \par Further Details:
 *  =====================
@ -248,10 +248,14 @@
 *
 *           Compute [ A  B ] = [ CS  SN ] [ AA  BB ]
 *                   [ C  D ]   [-SN  CS ] [ CC  DD ]
+*
+*           Note: Some of the multiplications are wrapped in parentheses to
+*                 prevent compilers from using FMA instructions. See
+*                 https://github.com/Reference-LAPACK/lapack/issues/1031.
 *
            A = AA*CS + CC*SN
-            B = BB*CS + DD*SN
-            C = -AA*SN + CC*CS
+            B = ( BB*CS ) + ( DD*SN )
+            C = -( AA*SN ) + ( CC*CS )
            D = -BB*SN + DD*CS
 *
            TEMP = HALF*( A+D )
--- a/lapack-netlib/SRC/sgelqt.f
+++ b/lapack-netlib/SRC/sgelqt.f
@ -18,7 +18,7 @@
 *>
 *> \verbatim
 *>
-*> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A
+*> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A
 *> using the compact WY representation of Q.
 *> \endverbatim
 *
@ -93,7 +93,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gelqt
 *
 *> \par Further Details:
 *  =====================
--- a/lapack-netlib/SRC/sgemlq.f
+++ b/lapack-netlib/SRC/sgemlq.f
@ -74,7 +74,7 @@
 *>          A is REAL array, dimension
 *>                               (LDA,M) if SIDE = 'L',
 *>                               (LDA,N) if SIDE = 'R'
-*>          Part of the data structure to represent Q as returned by DGELQ.
+*>          Part of the data structure to represent Q as returned by SGELQ.
 *> \endverbatim
 *>
 *> \param[in] LDA
--- a/lapack-netlib/SRC/sgemlqt.f
+++ b/lapack-netlib/SRC/sgemlqt.f
@ -20,7 +20,7 @@
 *>
 *> \verbatim
 *>
-*> DGEMLQT overwrites the general real M-by-N matrix C with
+*> SGEMLQT overwrites the general real M-by-N matrix C with
 *>
 *>                 SIDE = 'L'     SIDE = 'R'
 *> TRANS = 'N':      Q C            C Q
@ -145,7 +145,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup doubleGEcomputational
+*> \ingroup gemlqt
 *
 *  =====================================================================
      SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT,
--- a/lapack-netlib/SRC/slanv2.f
+++ b/lapack-netlib/SRC/slanv2.f
@ -109,7 +109,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup realOTHERauxiliary
+*> \ingroup lanv2
 *
 *> \par Further Details:
 *  =====================
@ -248,10 +248,14 @@
 *
 *           Compute [ A  B ] = [ CS  SN ] [ AA  BB ]
 *                   [ C  D ]   [-SN  CS ] [ CC  DD ]
+*
+*           Note: Some of the multiplications are wrapped in parentheses to
+*                 prevent compilers from using FMA instructions. See
+*                 https://github.com/Reference-LAPACK/lapack/issues/1031.
 *
            A = AA*CS + CC*SN
-            B = BB*CS + DD*SN
-            C = -AA*SN + CC*CS
+            B = ( BB*CS ) + ( DD*SN )
+            C = -( AA*SN ) + ( CC*CS )
            D = -BB*SN + DD*CS
 *
            TEMP = HALF*( A+D )
--- a/lapack-netlib/SRC/ssytrd.f
+++ b/lapack-netlib/SRC/ssytrd.f
@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
+      SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
+     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@ -248,7 +249,7 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
      END IF
 *
@ -316,7 +317,8 @@
 *           Update the unreduced submatrix A(1:i-1,1:i-1), using an
 *           update of the form:  A := A - V*W**T - W*V**T
 *
-            CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ),
+            CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1,
+     $                   I ),
     $                   LDA, WORK, LDWORK, ONE, A, LDA )
 *
 *           Copy superdiagonal elements back into A, and diagonal
--- a/lapack-netlib/SRC/zhetrd.f
+++ b/lapack-netlib/SRC/zhetrd.f
@ -139,7 +139,7 @@
 *> \author Univ. of Colorado Denver
 *> \author NAG Ltd.
 *
-*> \ingroup complex16HEcomputational
+*> \ingroup hetrd
 *
 *> \par Further Details:
 *  =====================
@ -188,7 +188,8 @@
 *> \endverbatim
 *>
 *  =====================================================================
-      SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO )
+      SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
+     $                   INFO )
 *
 *  -- LAPACK computational routine --
 *  -- LAPACK is a software package provided by Univ. of Tennessee,    --
@ -249,7 +250,7 @@
 *        Determine the block size.
 *
         NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
-         LWKOPT = N*NB
+         LWKOPT = MAX( 1, N*NB )
         WORK( 1 ) = LWKOPT
      END IF
 *
--- a/test/Makefile
+++ b/test/Makefile
@ -189,8 +189,11 @@ endif
 endif


+ifeq ($(SUPPORT_GEMM3M),1)
+level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m
+else
 level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
-
+endif

 ifneq ($(CROSS), 1)
 	rm -f ?BLAT3.SUMM