Merge branch 'develop' into betterPowerGEMVTail

This commit is contained in:
Chip Kerchner 2024-08-14 10:52:46 -05:00
commit 75472b830a
28 changed files with 1845 additions and 1745 deletions

View File

@ -8,7 +8,7 @@ project(OpenBLAS C ASM)
set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MAJOR_VERSION 0)
set(OpenBLAS_MINOR_VERSION 3) set(OpenBLAS_MINOR_VERSION 3)
set(OpenBLAS_PATCH_VERSION 27.dev) set(OpenBLAS_PATCH_VERSION 28.dev)
set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}")
@ -22,6 +22,8 @@ option(BUILD_WITHOUT_LAPACK "Do not build LAPACK and LAPACKE (Only BLAS or CBLAS
option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON) option(BUILD_LAPACK_DEPRECATED "When building LAPACK, include also some older, deprecated routines" ON)
set(LAPACK_STRLEN "" CACHE STRING "When building LAPACK, use this type (e.g. \"int\") for character lengths (defaults to size_t)")
option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON) option(BUILD_TESTING "Build LAPACK testsuite when building LAPACK" ON)
option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF) option(BUILD_BENCHMARKS "Build the collection of BLAS/LAPACK benchmarks" OFF)
@ -30,7 +32,7 @@ option(C_LAPACK "Build LAPACK from C sources instead of the original Fortran" OF
option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF) option(BUILD_WITHOUT_CBLAS "Do not build the C interface (CBLAS) to the BLAS functions" OFF)
option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64 or ppc only)" OFF) option(DYNAMIC_ARCH "Include support for multiple CPU targets, with automatic selection at runtime (x86/x86_64, aarch64, ppc or RISCV64-RVV1.0 only)" OFF)
option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF) option(DYNAMIC_OLDER "Include specific support for older x86 cpu models (Penryn,Dunnington,Atom,Nano,Opteron) with DYNAMIC_ARCH" OFF)
@ -256,6 +258,10 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "AIX|Android|Linux|FreeBSD|OpenBSD|NetBSD|Drago
endif() endif()
endif() endif()
if (APPLE AND BUILD_SHARED_LIBS)
set(CMAKE_MACOSX_RPATH ON)
endif()
# Seems that this hack isn't required since macOS 11 Big Sur # Seems that this hack isn't required since macOS 11 Big Sur
if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20) if (APPLE AND BUILD_SHARED_LIBS AND CMAKE_HOST_SYSTEM_VERSION VERSION_LESS 20)
set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1) set (CMAKE_C_USE_RESPONSE_FILE_FOR_OBJECTS 1)

View File

@ -1,4 +1,127 @@
OpenBLAS ChangeLog OpenBLAS ChangeLog
====================================================================
Version 0.3.28
8-Aug-2024
general:
- Reworked the unfinished implementation of HUGETLB from GotoBLAS
for allocating huge memory pages as buffers on suitable systems
- Changed the unfinished implementation of GEMM3M for the generic
target on all architectures to at least forward to regular GEMM
- Improved multithreaded GEMM performance for large non-skinny matrices
- Improved BLAS3 performance on larger multicore systems through improved
parallelism
- Improved performance of the initial memory allocation by reducing
locking overhead
- Improved performance of GBMV at small problem sizes by introducing
a size barrier for the switch to multithreading
- Added an implementation of the CBLAS_GEMM_BATCH extension
- Fixed miscompilation of CAXPYC and ZAXPYC on all architectures in
CMAKE builds (error introduced in 0.3.27)
- Fixed corner cases involving the handling of NAN and INFINITY
arguments in ?SCAL on all architectures
- Added support for cross-compiling to WebAssembly (WASM) with CMAKE (in
addition to the already present makefile support)
- Fixed NAN handling and potential accuracy issues in compilations with
Intel ICX by supplying a suitable fp-model option by default
- The contents of the GitHub project wiki have been converted into
a new set of documentation included with the source code.
- It is now possible to register a callback function that replaces
the built-in support for multithreading with an external backend
like TBB (openblas_set_threads_callback_function)
- Fixed potential duplication of suffixes in shared library naming
- Improved C compiler detection by the build system to tolerate more
naming variants for gcc builds
- Fixed an unnecessary dependency of the utest on CBLAS
- Fixed spurious error reports from the BLAS extensions utest
- Fixed unwanted invocation of the GEMM3M tests in cross-compilation
- Fixed a flaw in the makefile build that could lead to the pkgconfig
file containing an entry of UNKNOWN for the target cpu after installing
- Integrated fixes from the Reference-LAPACK project:
- Fixed uninitialized variables in the LAPACK tests for ?QP3RK (PR 961)
- Fixed potential bounds error in ?UNHR_COL/?ORHR_COL (PR 1018)
- Fixed potential infinite loop in the LAPACK testsuite (PR 1024)
- Made the variable type used for hidden length arguments configurable (PR 1025)
- Fixed SYTRD workspace computation and various typos (PR 1030)
- Prevented compiler use of FMA that could increase numerical error in ?GEEVX (PR 1033)
x86-64:
- reverted thread management under Windows to its state before 0.3.26
due to signs of race conditions in some circumstances now under study
- fixed accidental selection of the unoptimized generic SBGEMM kernel
in CMAKE builds for CooperLake and SapphireRapids targets
- fixed a potential thread buffer overrun in SBSTOBF16 on small systems
- fixed an accuracy issue in ZSCAL introduced in 0.3.26
- fixed compilation with CMAKE and recent releases of LLVM
- added support for Intel Emerald Rapids and Meteor Lake cpus
- added autodetection support for the Zhaoxin KX-7000 cpu
- fixed autodetection of Intel Prescott (probably broken since 0.3.19)
- fixed compilation for older targets with the Yocto SDK
- fixed compilation of the converter-generated C versions
of the LAPACK sources with gcc-14
- improved compiler options when building with CMAKE and LLVM for
AVX512-capable targets
- added support for supplying the L2 cache size via an environment
variable (OPENBLAS_L2_SIZE) in case it is not correctly reported
(as in some VM configurations)
- improved the error message shown when thread creation fails on startup
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
arm:
- fixed building for baremetal targets with make
arm64:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- added optimized SGEMV and DGEMV kernels for A64FX
- added optimized SVE kernels for small-matrix GEMM
- added A64FX to the cpu list for DYNAMIC_ARCH
- fixed building with support for cpu affinity
- worked around accuracy problems with C/ZNRM2 on NeoverseN1 and
Apple M targets
- improved GEMM performance on Neoverse V1
- fixed compilation for NEOVERSEN2 with older compilers
- fixed potential miscompilation of the SVE SDOT and DDOT kernels
- fixed potential miscompilation of the non-SVE CDOT and ZDOT kernels
- fixed a potential overflow when using very large user-defined BUFFERSIZE
- fixed setting the rpath entry of the dylib in CMAKE builds on MacOS
power:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- significantly improved performance of SBGEMM on POWER10
- fixed compilation with OpenMP and the XLF compiler
- fixed building of the BLAS extension utests under AIX
- fixed building of parts of the LAPACK testsuite with XLF
- fixed CSWAP/ZSWAP on big-endian POWER10 targets
- fixed a performance regression in SAXPY on POWER10 with OpenXL
- fixed accuracy issues in CSCAL/ZSCAL when compiled with LLVM
- fixed building for POWER9 under FreeBSD
- fixed a potential overflow when using very large user-defined BUFFERSIZE
- fixed an accuracy issue in the POWER6 kernels for GEMM and GEMV
riscv64:
- Added a fast path forwarding SGEMM and DGEMM calls with a 1xN or Mx1
matrix to the corresponding GEMV kernel
- fixed building for RISCV64_GENERIC with OpenMP enabled
- added DYNAMIC_ARCH support (comprising RISCV64_GENERIC and the two
RVV 1.0 targets with vector length of 128 and 256)
- worked around the ZVL128B kernels for AXPBY mishandling the special
case of zero Y increment
loongarch64:
- improved GEMM performance on servers of the 3C5000 generation
- improved performance and stability of DGEMM
- improved GEMV and TRSM kernels for LSX and LASX vector ABIs
- fixed CMAKE compilation with the INTERFACE64 option set
- fixed compilation with CMAKE
- worked around spurious errors flagged by the BLAS3 tests
- worked around a miscompilation of the POTRS utest by gcc 14.1
mips64:
- fixed ASUM and SUM kernels to accept negative step sizes in X
- fixed complex GEMV kernels for MSA
==================================================================== ====================================================================
Version 0.3.27 Version 0.3.27
4-Apr-2024 4-Apr-2024

View File

@ -45,6 +45,10 @@ else
LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS)) LAPACK_NOOPT := $(filter-out -O0 -O1 -O2 -O3 -Ofast -O -Og -Os,$(LAPACK_FFLAGS))
endif endif
ifdef LAPACK_STRLEN
LAPACK_FFLAGS += -DLAPACK_STRLEN=$(LAPACK_STRLEN)
endif
SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test SUBDIRS_ALL = $(SUBDIRS) test ctest utest exports benchmark ../laswp ../bench cpp_thread_test
.PHONY : all libs netlib $(RELA) test ctest shared install .PHONY : all libs netlib $(RELA) test ctest shared install

View File

@ -178,7 +178,7 @@ endif
@echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)" @echo 'libnamesuffix='$(LIBNAMESUFFIX) >> "$(PKGFILE)"
@echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)" @echo 'libsuffix='$(SYMBOLSUFFIX) >> "$(PKGFILE)"
@echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)" @echo 'includedir='$(OPENBLAS_INCLUDE_DIR) >> "$(PKGFILE)"
@echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(CORE) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)" @echo 'openblas_config= USE_64BITINT='$(INTERFACE64) 'DYNAMIC_ARCH='$(DYNAMIC_ARCH) 'DYNAMIC_OLDER='$(DYNAMIC_OLDER) 'NO_CBLAS='$(NO_CBLAS) 'NO_LAPACK='$(NO_LAPACK) 'NO_LAPACKE='$(NO_LAPACKE) 'NO_AFFINITY='$(NO_AFFINITY) 'USE_OPENMP='$(USE_OPENMP) $(TARGET) 'MAX_THREADS='$(NUM_THREADS)>> "$(PKGFILE)"
@echo 'version='$(VERSION) >> "$(PKGFILE)" @echo 'version='$(VERSION) >> "$(PKGFILE)"
@echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)" @echo 'extralib='$(PKG_EXTRALIB) >> "$(PKGFILE)"
@cat openblas.pc.in >> "$(PKGFILE)" @cat openblas.pc.in >> "$(PKGFILE)"

View File

@ -3,7 +3,7 @@
# #
# This library's version # This library's version
VERSION = 0.3.27.dev VERSION = 0.3.28.dev
# If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a # If you set this prefix, the library name will be lib$(LIBNAMESUFFIX)openblas.a
# and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library # and lib$(LIBNAMESUFFIX)openblas.so, with a matching soname in the shared library
@ -134,6 +134,12 @@ VERSION = 0.3.27.dev
# Build LAPACK Deprecated functions since LAPACK 3.6.0 # Build LAPACK Deprecated functions since LAPACK 3.6.0
BUILD_LAPACK_DEPRECATED = 1 BUILD_LAPACK_DEPRECATED = 1
# The variable type assumed for the length of character arguments when passing
# data between Fortran LAPACK and C BLAS (defaults to "size_t", but older GCC
# versions used "int"). Mismatches will not cause runtime failures but may result
# in build warnings or errors when building with link-time optimization (LTO)
# LAPACK_STRLEN=int
# Build RecursiveLAPACK on top of LAPACK # Build RecursiveLAPACK on top of LAPACK
# BUILD_RELAPACK = 1 # BUILD_RELAPACK = 1
# Have RecursiveLAPACK actually replace standard LAPACK routines instead of # Have RecursiveLAPACK actually replace standard LAPACK routines instead of

View File

@ -277,6 +277,12 @@ endif
ifeq ($(ARCH), arm64) ifeq ($(ARCH), arm64)
GEMM_GEMV_FORWARD = 1 GEMM_GEMV_FORWARD = 1
endif endif
ifeq ($(ARCH), riscv)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(ARCH), power)
GEMM_GEMV_FORWARD = 1
endif
ifeq ($(SMALL_MATRIX_OPT), 1) ifeq ($(SMALL_MATRIX_OPT), 1)
CCOMMON_OPT += -DSMALL_MATRIX_OPT CCOMMON_OPT += -DSMALL_MATRIX_OPT

View File

@ -57,7 +57,11 @@ if (DYNAMIC_ARCH)
set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10) set(DYNAMIC_CORE POWER6 POWER8 POWER9 POWER10)
set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT") set(CCOMMON_OPT "${CCOMMON_OPT} -DHAVE_P10_SUPPORT")
endif () endif ()
if (RISCV64)
set(DYNAMIC_CORE RISCV64_GENERIC RISCV64_ZVL128B RISCV64_ZVL256B)
endif ()
if (X86) if (X86)
set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO) set(DYNAMIC_CORE KATMAI COPPERMINE NORTHWOOD PRESCOTT BANIAS CORE2 PENRYN DUNNINGTON NEHALEM ATHLON OPTERON OPTERON_SSE3 BARCELONA BOBCAT ATOM NANO)
endif () endif ()

View File

@ -403,7 +403,7 @@ if (SMALL_MATRIX_OPT)
endif () endif ()
if (DYNAMIC_ARCH) if (DYNAMIC_ARCH)
if (X86 OR X86_64 OR ARM64 OR POWER) if (X86 OR X86_64 OR ARM64 OR POWER OR RISCV64)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_ARCH")
if (DYNAMIC_OLDER) if (DYNAMIC_OLDER)
set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER") set(CCOMMON_OPT "${CCOMMON_OPT} -DDYNAMIC_OLDER")
@ -621,7 +621,10 @@ set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} ${FCOMMON_OPT}")
set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}") set(FPFLAGS "${FPFLAGS} ${FCOMMON_OPT} ${COMMON_PROF}")
#For LAPACK Fortran codes. #For LAPACK Fortran codes.
set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}") set(LAPACK_FFLAGS "${LAPACK_FFLAGS} ${CMAKE_Fortran_FLAGS}" )
if (LAPACK_STRLEN)
set (LAPACK_FFLAGS "${LAPACK_FFLAGS} -DLAPACK_STRLEN=${LAPACK_STRLEN}")
endif()
set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}") set(LAPACK_FPFLAGS "${LAPACK_FPFLAGS} ${FPFLAGS}")
#Disable -fopenmp for LAPACK Fortran codes on Windows. #Disable -fopenmp for LAPACK Fortran codes on Windows.

View File

@ -111,8 +111,8 @@ typedef struct blas_queue {
struct blas_queue *next; struct blas_queue *next;
#if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__) #if defined( __WIN32__) || defined(__CYGWIN32__) || defined(_WIN32) || defined(__CYGWIN__)
// CRITICAL_SECTION lock; CRITICAL_SECTION lock;
// HANDLE finish; HANDLE finish;
volatile int finished; volatile int finished;
#else #else
pthread_mutex_t lock; pthread_mutex_t lock;

View File

@ -52,6 +52,8 @@ if (DYNAMIC_ARCH)
list(APPEND COMMON_SOURCES dynamic_arm64.c) list(APPEND COMMON_SOURCES dynamic_arm64.c)
elseif (POWER) elseif (POWER)
list(APPEND COMMON_SOURCES dynamic_power.c) list(APPEND COMMON_SOURCES dynamic_power.c)
elseif (RISCV64)
list(APPEND COMMON_SOURCES dynamic_riscv64.c detect_riscv64.c)
else () else ()
list(APPEND COMMON_SOURCES dynamic.c) list(APPEND COMMON_SOURCES dynamic.c)
endif () endif ()

File diff suppressed because it is too large Load Diff

View File

@ -2769,7 +2769,7 @@ void *blas_memory_alloc(int procpos){
#ifdef ALLOC_DEVICEDRIVER #ifdef ALLOC_DEVICEDRIVER
alloc_devicedirver, alloc_devicedirver,
#endif #endif
#ifdef ALLOC_SHM && !defined(ALLOC_HUGETLB) #if defined(ALLOC_SHM) && !defined(ALLOC_HUGETLB)
alloc_shm, alloc_shm,
#endif #endif
#if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS)) #if ((defined ALLOC_HUGETLB) && (defined OS_LINUX || defined OS_AIX || defined __sun__ || defined OS_WINDOWS))

View File

@ -498,7 +498,7 @@ void CNAME(enum CBLAS_ORDER order, enum CBLAS_TRANSPOSE TransA, enum CBLAS_TRANS
args.m, args.n, args.k, args.lda, args.ldb, args.ldc); args.m, args.n, args.k, args.lda, args.ldb, args.ldc);
#endif #endif
#if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) #if defined(GEMM_GEMV_FORWARD) && !defined(GEMM3M) && !defined(COMPLEX) && !defined(BFLOAT16)
// Check if we can convert GEMM -> GEMV // Check if we can convert GEMM -> GEMV
if (args.k != 0) { if (args.k != 0) {
if (args.n == 1) { if (args.n == 1) {

View File

@ -17,15 +17,6 @@ ifeq ($(ARCH), ia64)
USE_GEMM3M = 1 USE_GEMM3M = 1
endif endif
ifneq ($(DYNAMIC_ARCH), 1)
ifeq ($(TARGET), GENERIC)
USE_GEMM3M = 0
endif
else
ifeq ($(CORE), GENERIC)
USE_GEMM3M = 0
endif
endif
ifeq ($(ARCH), arm) ifeq ($(ARCH), arm)
USE_TRMM = 1 USE_TRMM = 1

View File

@ -25,10 +25,16 @@ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/ *****************************************************************************/
#if 1
#include "zgemmkernel_2x2.c"
#else
#include "common.h" #include "common.h"
int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc) int CNAME(BLASLONG bm, BLASLONG bn, BLASLONG bk, FLOAT alphar, FLOAT alphai, FLOAT * ba, FLOAT * bb, FLOAT * C, BLASLONG ldc)
{ {
return 0; return 0;
} }
#endif

View File

@ -242,4 +242,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
} }
} }
return(0);
} }

View File

@ -200,4 +200,5 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
} }
} }
} }
return(0);
} }

File diff suppressed because it is too large Load Diff

View File

@ -163,7 +163,8 @@
*> \endverbatim *> \endverbatim
*> *>
* ===================================================================== * =====================================================================
SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK, INFO ) SUBROUTINE CGEHRD( N, ILO, IHI, A, LDA, TAU, WORK, LWORK,
$ INFO )
* *
* -- LAPACK computational routine -- * -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -193,7 +194,8 @@
COMPLEX EI COMPLEX EI
* .. * ..
* .. External Subroutines .. * .. External Subroutines ..
EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB, CTRMM, EXTERNAL CAXPY, CGEHD2, CGEMM, CLAHR2, CLARFB,
$ CTRMM,
$ XERBLA $ XERBLA
* .. * ..
* .. Intrinsic Functions .. * .. Intrinsic Functions ..
@ -230,7 +232,7 @@
IF( NH.LE.1 ) THEN IF( NH.LE.1 ) THEN
LWKOPT = 1 LWKOPT = 1
ELSE ELSE
NB = MIN( NBMAX, ILAENV( 1, 'DGEHRD', ' ', N, ILO, IHI, NB = MIN( NBMAX, ILAENV( 1, 'CGEHRD', ' ', N, ILO, IHI,
$ -1 ) ) $ -1 ) )
LWKOPT = N*NB + TSIZE LWKOPT = N*NB + TSIZE
END IF END IF

View File

@ -139,7 +139,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup complexHEcomputational *> \ingroup hetrd
* *
*> \par Further Details: *> \par Further Details:
* ===================== * =====================
@ -188,7 +188,8 @@
*> \endverbatim *> \endverbatim
*> *>
* ===================================================================== * =====================================================================
SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) SUBROUTINE CHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
* *
* -- LAPACK computational routine -- * -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -225,7 +226,8 @@
* .. External Functions .. * .. External Functions ..
LOGICAL LSAME LOGICAL LSAME
INTEGER ILAENV INTEGER ILAENV
EXTERNAL LSAME, ILAENV REAL SROUNDUP_LWORK
EXTERNAL LSAME, ILAENV, SROUNDUP_LWORK
* .. * ..
* .. Executable Statements .. * .. Executable Statements ..
* *
@ -249,8 +251,8 @@
* Determine the block size. * Determine the block size.
* *
NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 ) NB = ILAENV( 1, 'CHETRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = LWKOPT WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
END IF END IF
* *
IF( INFO.NE.0 ) THEN IF( INFO.NE.0 ) THEN
@ -367,7 +369,7 @@
$ TAU( I ), IINFO ) $ TAU( I ), IINFO )
END IF END IF
* *
WORK( 1 ) = LWKOPT WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
RETURN RETURN
* *
* End of CHETRD * End of CHETRD

View File

@ -109,7 +109,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup doubleOTHERauxiliary *> \ingroup lanv2
* *
*> \par Further Details: *> \par Further Details:
* ===================== * =====================
@ -144,7 +144,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, DOUBLE PRECISION AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
$ SAFMN2, SAFMX2 $ SAFMN2, SAFMX2
INTEGER COUNT INTEGER COUNT
* .. * ..
@ -248,10 +248,14 @@
* *
* Compute [ A B ] = [ CS SN ] [ AA BB ] * Compute [ A B ] = [ CS SN ] [ AA BB ]
* [ C D ] [-SN CS ] [ CC DD ] * [ C D ] [-SN CS ] [ CC DD ]
*
* Note: Some of the multiplications are wrapped in parentheses to
* prevent compilers from using FMA instructions. See
* https://github.com/Reference-LAPACK/lapack/issues/1031.
* *
A = AA*CS + CC*SN A = AA*CS + CC*SN
B = BB*CS + DD*SN B = ( BB*CS ) + ( DD*SN )
C = -AA*SN + CC*CS C = -( AA*SN ) + ( CC*CS )
D = -BB*SN + DD*CS D = -BB*SN + DD*CS
* *
TEMP = HALF*( A+D ) TEMP = HALF*( A+D )

View File

@ -18,7 +18,7 @@
*> *>
*> \verbatim *> \verbatim
*> *>
*> DGELQT computes a blocked LQ factorization of a real M-by-N matrix A *> SGELQT computes a blocked LQ factorization of a real M-by-N matrix A
*> using the compact WY representation of Q. *> using the compact WY representation of Q.
*> \endverbatim *> \endverbatim
* *
@ -93,7 +93,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup doubleGEcomputational *> \ingroup gelqt
* *
*> \par Further Details: *> \par Further Details:
* ===================== * =====================

View File

@ -74,7 +74,7 @@
*> A is REAL array, dimension *> A is REAL array, dimension
*> (LDA,M) if SIDE = 'L', *> (LDA,M) if SIDE = 'L',
*> (LDA,N) if SIDE = 'R' *> (LDA,N) if SIDE = 'R'
*> Part of the data structure to represent Q as returned by DGELQ. *> Part of the data structure to represent Q as returned by SGELQ.
*> \endverbatim *> \endverbatim
*> *>
*> \param[in] LDA *> \param[in] LDA

View File

@ -20,7 +20,7 @@
*> *>
*> \verbatim *> \verbatim
*> *>
*> DGEMLQT overwrites the general real M-by-N matrix C with *> SGEMLQT overwrites the general real M-by-N matrix C with
*> *>
*> SIDE = 'L' SIDE = 'R' *> SIDE = 'L' SIDE = 'R'
*> TRANS = 'N': Q C C Q *> TRANS = 'N': Q C C Q
@ -145,7 +145,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup doubleGEcomputational *> \ingroup gemlqt
* *
* ===================================================================== * =====================================================================
SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT, SUBROUTINE SGEMLQT( SIDE, TRANS, M, N, K, MB, V, LDV, T, LDT,

View File

@ -109,7 +109,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup realOTHERauxiliary *> \ingroup lanv2
* *
*> \par Further Details: *> \par Further Details:
* ===================== * =====================
@ -144,7 +144,7 @@
* .. * ..
* .. Local Scalars .. * .. Local Scalars ..
REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB, REAL AA, BB, BCMAX, BCMIS, CC, CS1, DD, EPS, P, SAB,
$ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN, $ SAC, SCALE, SIGMA, SN1, TAU, TEMP, Z, SAFMIN,
$ SAFMN2, SAFMX2 $ SAFMN2, SAFMX2
INTEGER COUNT INTEGER COUNT
* .. * ..
@ -248,10 +248,14 @@
* *
* Compute [ A B ] = [ CS SN ] [ AA BB ] * Compute [ A B ] = [ CS SN ] [ AA BB ]
* [ C D ] [-SN CS ] [ CC DD ] * [ C D ] [-SN CS ] [ CC DD ]
*
* Note: Some of the multiplications are wrapped in parentheses to
* prevent compilers from using FMA instructions. See
* https://github.com/Reference-LAPACK/lapack/issues/1031.
* *
A = AA*CS + CC*SN A = AA*CS + CC*SN
B = BB*CS + DD*SN B = ( BB*CS ) + ( DD*SN )
C = -AA*SN + CC*CS C = -( AA*SN ) + ( CC*CS )
D = -BB*SN + DD*CS D = -BB*SN + DD*CS
* *
TEMP = HALF*( A+D ) TEMP = HALF*( A+D )

View File

@ -188,7 +188,8 @@
*> \endverbatim *> \endverbatim
*> *>
* ===================================================================== * =====================================================================
SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) SUBROUTINE SSYTRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
* *
* -- LAPACK computational routine -- * -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -248,7 +249,7 @@
* Determine the block size. * Determine the block size.
* *
NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 ) NB = ILAENV( 1, 'SSYTRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = SROUNDUP_LWORK(LWKOPT) WORK( 1 ) = SROUNDUP_LWORK(LWKOPT)
END IF END IF
* *
@ -316,7 +317,8 @@
* Update the unreduced submatrix A(1:i-1,1:i-1), using an * Update the unreduced submatrix A(1:i-1,1:i-1), using an
* update of the form: A := A - V*W**T - W*V**T * update of the form: A := A - V*W**T - W*V**T
* *
CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1, I ), CALL SSYR2K( UPLO, 'No transpose', I-1, NB, -ONE, A( 1,
$ I ),
$ LDA, WORK, LDWORK, ONE, A, LDA ) $ LDA, WORK, LDWORK, ONE, A, LDA )
* *
* Copy superdiagonal elements back into A, and diagonal * Copy superdiagonal elements back into A, and diagonal

View File

@ -139,7 +139,7 @@
*> \author Univ. of Colorado Denver *> \author Univ. of Colorado Denver
*> \author NAG Ltd. *> \author NAG Ltd.
* *
*> \ingroup complex16HEcomputational *> \ingroup hetrd
* *
*> \par Further Details: *> \par Further Details:
* ===================== * =====================
@ -188,7 +188,8 @@
*> \endverbatim *> \endverbatim
*> *>
* ===================================================================== * =====================================================================
SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK, INFO ) SUBROUTINE ZHETRD( UPLO, N, A, LDA, D, E, TAU, WORK, LWORK,
$ INFO )
* *
* -- LAPACK computational routine -- * -- LAPACK computational routine --
* -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- LAPACK is a software package provided by Univ. of Tennessee, --
@ -249,7 +250,7 @@
* Determine the block size. * Determine the block size.
* *
NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 ) NB = ILAENV( 1, 'ZHETRD', UPLO, N, -1, -1, -1 )
LWKOPT = N*NB LWKOPT = MAX( 1, N*NB )
WORK( 1 ) = LWKOPT WORK( 1 ) = LWKOPT
END IF END IF
* *

View File

@ -189,8 +189,11 @@ endif
endif endif
ifeq ($(SUPPORT_GEMM3M),1)
level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3_3m
else
level3: $(B3) $(S3) $(D3) $(C3) $(Z3) level3: $(B3) $(S3) $(D3) $(C3) $(Z3)
endif
ifneq ($(CROSS), 1) ifneq ($(CROSS), 1)
rm -f ?BLAT3.SUMM rm -f ?BLAT3.SUMM
@ -263,7 +266,7 @@ endif
endif endif
level3_3m : zblat3_3m cblat3_3m level3_3m: zblat3_3m cblat3_3m
ifneq ($(CROSS), 1) ifneq ($(CROSS), 1)
rm -f ?BLAT3_3M.SUMM rm -f ?BLAT3_3M.SUMM
OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat OPENBLAS_NUM_THREADS=1 OMP_NUM_THREADS=1 ./cblat3_3m < ./cblat3_3m.dat