diff --git a/.travis.yml b/.travis.yml index bde0e202d..2a221e3bd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -224,12 +224,21 @@ matrix: before_script: - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" - brew update - - brew install gcc@10 script: - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE env: - - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" - + - BTYPE="TARGET=HASWELL USE_OPENMP=1 BINARY=64 INTERFACE64=1 CC=gcc-10 FC=gfortran-10" + + - <<: *test-macos + osx_image: xcode12 + before_script: + - COMMON_FLAGS="DYNAMIC_ARCH=1 NUM_THREADS=32" + - brew update + script: + - travis_wait 45 make QUIET_MAKE=1 $COMMON_FLAGS $BTYPE + env: + - BTYPE="TARGET=NEHALEM BINARY=64 INTERFACE64=1 FC=gfortran-10" + # - <<: *test-macos # osx_image: xcode10 # env: diff --git a/CMakeLists.txt b/CMakeLists.txt index 3107ef9a9..d0313c842 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -6,7 +6,7 @@ cmake_minimum_required(VERSION 2.8.5) project(OpenBLAS C ASM) set(OpenBLAS_MAJOR_VERSION 0) set(OpenBLAS_MINOR_VERSION 3) -set(OpenBLAS_PATCH_VERSION 14) +set(OpenBLAS_PATCH_VERSION 14.dev) set(OpenBLAS_VERSION "${OpenBLAS_MAJOR_VERSION}.${OpenBLAS_MINOR_VERSION}.${OpenBLAS_PATCH_VERSION}") # Adhere to GNU filesystem layout conventions diff --git a/Changelog.txt b/Changelog.txt index 5662bc5c6..6c5cf573e 100644 --- a/Changelog.txt +++ b/Changelog.txt @@ -1,4 +1,54 @@ OpenBLAS ChangeLog +==================================================================== +Version 0.3.15 + 2-May-2021 + +common: + - imported improvements and bugfixes from Reference-LAPACK 3.9.1 + - imported LAPACKE interface fixes from Reference-LAPACK PRs 534 + 537 + - fixed a problem in the cpu detection of 0.3.14 that prevented cross-compilation + - fixed a sequence problem in the generation of softlinks to the library in GMAKE + +RISC V: + - fixed compilation on RISCV (missing entry in getarch) + - fixed a potential division by zero in CROTG and ZROTG + +POWER: + - fixed LAPACK testsuite failures seen with the NVIDIA HPC compiler + - improved CGEMM, DGEMM and ZGEMM performance on POWER10 + - added an optimized ZGEMV kernel for POWER10 + - fixed a potential division by zero in CROTG and ZROTG + +x86_64: + - added support for Intel Control-flow Enforcement Technology (CET) + - reverted the DOMATCOPY_RT code to the generic C version + - fixed a bug in the AVX512 SGEMM kernel introduced in 0.3.14 + - fixed misapplication of -msse flag to non-SSE cpus in DYNAMIC_ARCH + - added support for compilation of the benchmarks on older OSX versions + - fix propagation of the NO_AVX512 option in CMAKE builds + - fix compilation of the AVX512 SGEMM kernel with clang-cl on Windows + - fixed compilation of the CTESTs with INTERFACE64=1 (random faults on OSX) + - corrected the Haswell DROT kernel to require AVX2/FMA3 rather than AVX512 + +ARM: + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +ARM64: + - fixed spurious reads outside the array in the SGEMM tcopy macro + - fixed a potential division by zero in CROTG and ZROTG + - fixed a segmentation fault in DYNAMIC_ARCH builds (reappeared in 0.3.14) + +MIPS + - fixed a potential division by zero in CROTG and ZROTG + - fixed a potential overflow in IMATCOPY/ZIMATCOPY and the CTESTs + +MIPS64: + - fixed a potential division by zero in CROTG and ZROTG + +SPARC: + - fixed a potential division by zero in CROTG and ZROTG + ==================================================================== Version 0.3.14 17-Mar-2021 
diff --git a/Makefile b/Makefile index de0735c4a..555d1c467 100644 --- a/Makefile +++ b/Makefile @@ -167,7 +167,6 @@ ifeq ($(NO_SHARED), 1) $(error OpenBLAS: neither static nor shared are enabled.) endif endif - @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @for d in $(SUBDIRS) ; \ do if test -d $$d; then \ $(MAKE) -C $$d $(@F) || exit 1 ; \ @@ -196,6 +195,7 @@ endif ifdef USE_THREAD @echo USE_THREAD=$(USE_THREAD) >> Makefile.conf_last endif + @-ln -fs $(LIBNAME) $(LIBPREFIX).$(LIBSUFFIX) @touch lib.grd prof : prof_blas prof_lapack diff --git a/Makefile.rule b/Makefile.rule index 5a46bf6b0..38d0161a3 100644 --- a/Makefile.rule +++ b/Makefile.rule @@ -3,7 +3,7 @@ # # This library's version -VERSION = 0.3.14 +VERSION = 0.3.14.dev # If you set the suffix, the library name will be libopenblas_$(LIBNAMESUFFIX).a # and libopenblas_$(LIBNAMESUFFIX).so. Meanwhile, the soname in shared library diff --git a/Makefile.x86 b/Makefile.x86 index 0e27264d8..893379c33 100644 --- a/Makefile.x86 +++ b/Makefile.x86 @@ -1,10 +1,21 @@ # COMPILER_PREFIX = mingw32- -ifdef HAVE_SSE -CCOMMON_OPT += -msse -FCOMMON_OPT += -msse +ifndef DYNAMIC_ARCH +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif endif +ifdef ADD_CPUFLAGS +ifdef HAVE_SSE +CCOMMON_OPT += -msse +ifneq ($(F_COMPILER), NAG) +FCOMMON_OPT += -msse +endif +endif +endif ifeq ($(OSNAME), Interix) ARFLAGS = -m x86 diff --git a/Makefile.x86_64 b/Makefile.x86_64 index 5406494c9..f62ab9e5e 100644 --- a/Makefile.x86_64 +++ b/Makefile.x86_64 @@ -8,6 +8,16 @@ endif endif endif + +ifndef DYNAMIC_ARCH +ADD_CPUFLAGS = 1 +else +ifdef TARGET_CORE +ADD_CPUFLAGS = 1 +endif +endif + +ifdef ADD_CPUFLAGS ifdef HAVE_SSE3 CCOMMON_OPT += -msse3 ifneq ($(F_COMPILER), NAG) @@ -44,7 +54,6 @@ endif endif ifeq ($(CORE), SKYLAKEX) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 CCOMMON_OPT += -march=skylake-avx512 ifneq ($(F_COMPILER), NAG) @@ -62,10 +71,8 @@ endif endif endif endif -endif ifeq ($(CORE), COOPERLAKE) -ifndef DYNAMIC_ARCH ifndef NO_AVX512 ifeq ($(C_COMPILER), GCC) # cooperlake support was added in 10.1 @@ -88,7 +95,6 @@ endif endif endif endif -endif ifdef HAVE_AVX2 ifndef NO_AVX2 @@ -120,6 +126,7 @@ endif endif endif +endif ifeq ($(OSNAME), Interix) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 639cb3558..56a3fd4ae 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -4,7 +4,15 @@ trigger: branches: include: - develop - +resources: + containers: + - container: oneapi-hpckit + image: intel/oneapi-hpckit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + - container: oneapi-basekit + image: intel/oneapi-basekit:latest + options: '-v /usr/bin/sudo:/usr/bin/sudo -v /usr/lib/sudo/libsudo_util.so.0:/usr/lib/sudo/libsudo_util.so.0 -v /usr/lib/sudo/sudoers.so:/usr/lib/sudo/sudoers.so' + jobs: # manylinux1 is useful to test because the # standard Docker container uses an old version @@ -68,4 +76,64 @@ jobs: dir openblas_utest.exe - +- job: OSX_OpenMP + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make TARGET=CORE2 DYNAMIC_ARCH=1 USE_OPENMP=1 INTERFACE64=1 CC=gcc-10 FC=gfortran-10 + +- job: OSX_GCC_Nothreads + pool: + vmImage: 'macOS-10.15' + steps: + - script: | + brew update + make USE_THREADS=0 CC=gcc-10 FC=gfortran-10 + +- job: OSX_OpenMP_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + LIBRARY_PATH: /usr/local/opt/llvm/lib + steps: + - script: | + 
brew update + brew install llvm libomp + make TARGET=CORE2 USE_OPENMP=1 INTERFACE64=1 DYNAMIC_ARCH=1 CC=/usr/local/opt/llvm/bin/clang FC=gfortran-10 + +- job: OSX_Ifort_Clang + pool: + vmImage: 'macOS-10.15' + variables: + LD_LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_HPCKIT_URL: https://registrationcenter-download.intel.com/akdlm/irc_nas/17643/m_HPCKit_p_2021.2.0.2903_offline.dmg + LIBRARY_PATH: /usr/local/opt/llvm/lib + MACOS_FORTRAN_COMPONENTS: intel.oneapi.mac.ifort-compiler + steps: + - script: | + brew update + brew install llvm libomp + sudo mkdir -p /opt/intel + sudo chown $USER /opt/intel + displayName: prepare for cache restore + - task: Cache@2 + inputs: + path: /opt/intel/oneapi + key: '"install" | "$(MACOS_HPCKIT_URL)" | "$(MACOS_FORTRAN_COMPONENTS)"' + cacheHitVar: CACHE_RESTORED + - script: | + curl --output webimage.dmg --url $(MACOS_HPCKIT_URL) --retry 5 --retry-delay 5 + hdiutil attach webimage.dmg + sudo /Volumes/"$(basename "$(MACOS_HPCKIT_URL)" .dmg)"/bootstrapper.app/Contents/MacOS/bootstrapper -s --action install --components="$(MACOS_FORTRAN_COMPONENTS)" --eula=accept --continue-with-optional-error=yes --log-dir=. + installer_exit_code=$? + hdiutil detach /Volumes/"$(basename "$URL" .dmg)" -quiet + exit $installer_exit_code + displayName: install + condition: ne(variables.CACHE_RESTORED, 'true') + - script: | + source /opt/intel/oneapi/setvars.sh + make CC=/usr/local/opt/llvm/bin/clang FC=ifort + diff --git a/benchmark/bench.h b/benchmark/bench.h index 83de8ab2b..c03d72bef 100644 --- a/benchmark/bench.h +++ b/benchmark/bench.h @@ -3,6 +3,8 @@ #include #ifdef __CYGWIN32__ #include +#elif defined(__APPLE__) +#include #endif #include "common.h" diff --git a/cmake/lapack.cmake b/cmake/lapack.cmake index 73f2592ef..0e45d4c63 100644 --- a/cmake/lapack.cmake +++ b/cmake/lapack.cmake @@ -66,7 +66,7 @@ set(SLASRC slaqgb.f slaqge.f slaqp2.f slaqps.f slaqsb.f slaqsp.f slaqsy.f slaqr0.f slaqr1.f slaqr2.f slaqr3.f slaqr4.f slaqr5.f slaqtr.f slar1v.f slar2v.f ilaslr.f ilaslc.f - slarf.f slarfb.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f + slarf.f slarfb.f slarfb_gett.f slarfg.f slarfgp.f slarft.f slarfx.f slarfy.f slargv.f slarrv.f slartv.f slarz.f slarzb.f slarzt.f slasy2.f slasyf.f slasyf_rook.f slasyf_rk.f slasyf_aa.f @@ -112,14 +112,14 @@ set(SLASRC sgeqrt.f sgeqrt2.f sgeqrt3.f sgemqrt.f stpqrt.f stpqrt2.f stpmqrt.f stprfb.f sgelqt.f sgelqt3.f sgemlqt.f - sgetsls.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f + sgetsls.f sgetsqrhrt.f sgeqr.f slatsqr.f slamtsqr.f sgemqr.f sgelq.f slaswlq.f slamswlq.f sgemlq.f stplqt.f stplqt2.f stpmlqt.f ssytrd_2stage.f ssytrd_sy2sb.f ssytrd_sb2st.F ssb2st_kernels.f ssyevd_2stage.f ssyev_2stage.f ssyevx_2stage.f ssyevr_2stage.f ssbev_2stage.f ssbevx_2stage.f ssbevd_2stage.f ssygv_2stage.f sgesvdq.f slaorhr_col_getrfnp.f - slaorhr_col_getrfnp2.f sorgtsqr.f sorhr_col.f ) + slaorhr_col_getrfnp2.f sorgtsqr.f sorgtsqr_row.f sorhr_col.f ) set(SXLASRC sgesvxx.f sgerfsx.f sla_gerfsx_extended.f sla_geamv.f sla_gercond.f sla_gerpvgrw.f ssysvxx.f ssyrfsx.f @@ -171,7 +171,7 @@ set(CLASRC claqhb.f claqhe.f claqhp.f claqp2.f claqps.f claqsb.f claqr0.f claqr1.f claqr2.f claqr3.f claqr4.f claqr5.f claqsp.f claqsy.f clar1v.f clar2v.f ilaclr.f ilaclc.f - clarf.f clarfb.f clarfg.f clarfgp.f clarft.f + clarf.f clarfb.f clarfb_gett.f clarfg.f clarfgp.f clarft.f clarfx.f clarfy.f clargv.f clarnv.f clarrv.f clartg.f clartv.f clarz.f clarzb.f clarzt.f clascl.f claset.f clasr.f classq.f clasyf.f clasyf_rook.f clasyf_rk.f clasyf_aa.f @@ -209,14 +209,14 @@ 
set(CLASRC cgeqrt.f cgeqrt2.f cgeqrt3.f cgemqrt.f ctpqrt.f ctpqrt2.f ctpmqrt.f ctprfb.f cgelqt.f cgelqt3.f cgemlqt.f - cgetsls.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f + cgetsls.f cgetsqrhrt.f cgeqr.f clatsqr.f clamtsqr.f cgemqr.f cgelq.f claswlq.f clamswlq.f cgemlq.f ctplqt.f ctplqt2.f ctpmlqt.f chetrd_2stage.f chetrd_he2hb.f chetrd_hb2st.F chb2st_kernels.f cheevd_2stage.f cheev_2stage.f cheevx_2stage.f cheevr_2stage.f chbev_2stage.f chbevx_2stage.f chbevd_2stage.f chegv_2stage.f cgesvdq.f claunhr_col_getrfnp.f claunhr_col_getrfnp2.f - cungtsqr.f cunhr_col.f ) + cungtsqr.f cungtsqr_row.f cunhr_col.f ) set(CXLASRC cgesvxx.f cgerfsx.f cla_gerfsx_extended.f cla_geamv.f cla_gercond_c.f cla_gercond_x.f cla_gerpvgrw.f @@ -253,7 +253,7 @@ set(DLASRC dlaqgb.f dlaqge.f dlaqp2.f dlaqps.f dlaqsb.f dlaqsp.f dlaqsy.f dlaqr0.f dlaqr1.f dlaqr2.f dlaqr3.f dlaqr4.f dlaqr5.f dlaqtr.f dlar1v.f dlar2v.f iladlr.f iladlc.f - dlarf.f dlarfb.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f + dlarf.f dlarfb.f dlarfb_gett.f dlarfg.f dlarfgp.f dlarft.f dlarfx.f dlarfy.f dlargv.f dlarrv.f dlartv.f dlarz.f dlarzb.f dlarzt.f dlasy2.f dlasyf.f dlasyf_rook.f dlasyf_rk.f dlasyf_aa.f @@ -300,14 +300,14 @@ set(DLASRC dgeqrt.f dgeqrt2.f dgeqrt3.f dgemqrt.f dtpqrt.f dtpqrt2.f dtpmqrt.f dtprfb.f dgelqt.f dgelqt3.f dgemlqt.f - dgetsls.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f + dgetsls.f dgetsqrhrt.f dgeqr.f dlatsqr.f dlamtsqr.f dgemqr.f dgelq.f dlaswlq.f dlamswlq.f dgemlq.f dtplqt.f dtplqt2.f dtpmlqt.f dsytrd_2stage.f dsytrd_sy2sb.f dsytrd_sb2st.F dsb2st_kernels.f dsyevd_2stage.f dsyev_2stage.f dsyevx_2stage.f dsyevr_2stage.f dsbev_2stage.f dsbevx_2stage.f dsbevd_2stage.f dsygv_2stage.f dcombssq.f dgesvdq.f dlaorhr_col_getrfnp.f - dlaorhr_col_getrfnp2.f dorgtsqr.f dorhr_col.f ) + dlaorhr_col_getrfnp2.f dorgtsqr.f dorgtsqr_row.f dorhr_col.f ) set(DXLASRC dgesvxx.f dgerfsx.f dla_gerfsx_extended.f dla_geamv.f dla_gercond.f dla_gerpvgrw.f dsysvxx.f dsyrfsx.f @@ -360,7 +360,7 @@ set(ZLASRC zlaqhb.f zlaqhe.f zlaqhp.f zlaqp2.f zlaqps.f zlaqsb.f zlaqr0.f zlaqr1.f zlaqr2.f zlaqr3.f zlaqr4.f zlaqr5.f zlaqsp.f zlaqsy.f zlar1v.f zlar2v.f ilazlr.f ilazlc.f - zlarcm.f zlarf.f zlarfb.f + zlarcm.f zlarf.f zlarfb.f zlarfb_gett.f zlarfg.f zlarfgp.f zlarft.f zlarfx.f zlarfy.f zlargv.f zlarnv.f zlarrv.f zlartg.f zlartv.f zlarz.f zlarzb.f zlarzt.f zlascl.f zlaset.f zlasr.f @@ -402,13 +402,13 @@ set(ZLASRC ztpqrt.f ztpqrt2.f ztpmqrt.f ztprfb.f ztplqt.f ztplqt2.f ztpmlqt.f zgelqt.f zgelqt3.f zgemlqt.f - zgetsls.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f + zgetsls.f zgetsqrhrt.f zgeqr.f zlatsqr.f zlamtsqr.f zgemqr.f zgelq.f zlaswlq.f zlamswlq.f zgemlq.f zhetrd_2stage.f zhetrd_he2hb.f zhetrd_hb2st.F zhb2st_kernels.f zheevd_2stage.f zheev_2stage.f zheevx_2stage.f zheevr_2stage.f zhbev_2stage.f zhbevx_2stage.f zhbevd_2stage.f zhegv_2stage.f zgesvdq.f zlaunhr_col_getrfnp.f zlaunhr_col_getrfnp2.f - zungtsqr.f zunhr_col.f) + zungtsqr.f zungtsqr_row.f zunhr_col.f) set(ZXLASRC zgesvxx.f zgerfsx.f zla_gerfsx_extended.f zla_geamv.f zla_gercond_c.f zla_gercond_x.f zla_gerpvgrw.f zsysvxx.f zsyrfsx.f diff --git a/cmake/lapacke.cmake b/cmake/lapacke.cmake index 54a583887..340ea6d6c 100644 --- a/cmake/lapacke.cmake +++ b/cmake/lapacke.cmake @@ -114,6 +114,8 @@ set(CSRC lapacke_cgetrs_work.c lapacke_cgetsls.c lapacke_cgetsls_work.c + lapacke_cgetsqrhrt.c + lapacke_cgetsqrhrt_work.c lapacke_cggbak.c lapacke_cggbak_work.c lapacke_cggbal.c @@ -590,6 +592,8 @@ set(CSRC lapacke_cungrq_work.c lapacke_cungtr.c lapacke_cungtr_work.c + lapacke_cungtsqr_row.c + 
lapacke_cungtsqr_row_work.c lapacke_cunmbr.c lapacke_cunmbr_work.c lapacke_cunmhr.c @@ -735,6 +739,8 @@ set(DSRC lapacke_dgetrs_work.c lapacke_dgetsls.c lapacke_dgetsls_work.c + lapacke_dgetsqrhrt.c + lapacke_dgetsqrhrt_work.c lapacke_dggbak.c lapacke_dggbak_work.c lapacke_dggbal.c @@ -862,6 +868,8 @@ set(DSRC lapacke_dorgrq_work.c lapacke_dorgtr.c lapacke_dorgtr_work.c + lapacke_dorgtsqr_row.c + lapacke_dorgtsqr_row_work.c lapacke_dormbr.c lapacke_dormbr_work.c lapacke_dormhr.c @@ -1309,6 +1317,8 @@ set(SSRC lapacke_sgetrs_work.c lapacke_sgetsls.c lapacke_sgetsls_work.c + lapacke_sgetsqrhrt.c + lapacke_sgetsqrhrt_work.c lapacke_sggbak.c lapacke_sggbak_work.c lapacke_sggbal.c @@ -1435,6 +1445,8 @@ set(SSRC lapacke_sorgrq_work.c lapacke_sorgtr.c lapacke_sorgtr_work.c + lapacke_sorgtsqr_row.c + lapacke_sorgtsqr_row_work.c lapacke_sormbr.c lapacke_sormbr_work.c lapacke_sormhr.c @@ -1877,6 +1889,8 @@ set(ZSRC lapacke_zgetrs_work.c lapacke_zgetsls.c lapacke_zgetsls_work.c + lapacke_zgetsqrhrt.c + lapacke_zgetsqrhrt_work.c lapacke_zggbak.c lapacke_zggbak_work.c lapacke_zggbal.c @@ -2351,6 +2365,8 @@ set(ZSRC lapacke_zungrq_work.c lapacke_zungtr.c lapacke_zungtr_work.c + lapacke_zungtsqr_row.c + lapacke_zungtsqr_row_work.c lapacke_zunmbr.c lapacke_zunmbr_work.c lapacke_zunmhr.c diff --git a/cmake/system.cmake b/cmake/system.cmake index eee429113..d6c71b774 100644 --- a/cmake/system.cmake +++ b/cmake/system.cmake @@ -299,6 +299,10 @@ if (NO_AVX2) set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX2") endif () +if (NO_AVX512) + set(CCOMMON_OPT "${CCOMMON_OPT} -DNO_AVX512") +endif () + if (USE_THREAD) # USE_SIMPLE_THREADED_LEVEL3 = 1 # NO_AFFINITY = 1 diff --git a/common.h b/common.h index 862e0b4db..ac795937c 100644 --- a/common.h +++ b/common.h @@ -416,6 +416,15 @@ please https://github.com/xianyi/OpenBLAS/issues/246 #include "common_alpha.h" #endif +#if (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(__CET__) && defined(__has_include) +#if __has_include() +#include +#endif +#endif +#ifndef _CET_ENDBR +#define _CET_ENDBR +#endif + #ifdef ARCH_X86 #include "common_x86.h" #endif diff --git a/common_x86.h b/common_x86.h index ec928e236..bc77eca58 100644 --- a/common_x86.h +++ b/common_x86.h @@ -340,7 +340,8 @@ REALNAME: .align 16; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call mcount diff --git a/common_x86_64.h b/common_x86_64.h index b813336c6..729a055ce 100644 --- a/common_x86_64.h +++ b/common_x86_64.h @@ -451,7 +451,8 @@ REALNAME: .align 512; \ .globl REALNAME ;\ .type REALNAME, @function; \ -REALNAME: +REALNAME: \ + _CET_ENDBR #ifdef PROFILE #define PROFCODE call *mcount@GOTPCREL(%rip) diff --git a/ctest/c_cblas2.c b/ctest/c_cblas2.c index 057096f32..6511e5271 100644 --- a/ctest/c_cblas2.c +++ b/ctest/c_cblas2.c @@ -20,7 +20,7 @@ void F77_cgemv(int *order, char *transp, int *m, int *n, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*LDA*sizeof( CBLAS_TEST_COMPLEX) ); + A = (CBLAS_TEST_COMPLEX *)malloc( (*m)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -50,7 +50,7 @@ void F77_cgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A=( CBLAS_TEST_COMPLEX* )malloc((*n+*kl)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=( CBLAS_TEST_COMPLEX* 
)malloc((*n+*kl)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -94,7 +94,7 @@ void F77_cgeru(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -122,7 +122,7 @@ void F77_cgerc(int *order, int *m, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX* )malloc((*m)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A=(CBLAS_TEST_COMPLEX* )malloc((*m)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -154,7 +154,7 @@ void F77_chemv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = (CBLAS_TEST_COMPLEX *)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A = (CBLAS_TEST_COMPLEX *)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ){ A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -190,7 +190,7 @@ int i,irow,j,jcol,LDA; *incx, beta, y, *incy ); else { LDA = *k+2; - A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A =(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -251,8 +251,8 @@ void F77_chpmv(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, beta, y, *incy); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); - AP = (CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX )); + AP = (CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -311,7 +311,7 @@ void F77_ctbmv(int *order, char *uplow, char *transp, char *diagn, x, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX *)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -375,7 +375,7 @@ void F77_ctbsv(int *order, char *uplow, char *transp, char *diagn, *incx); else { LDA = *k+2; - A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*LDA*sizeof(CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n+*k)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -436,8 +436,8 @@ void F77_ctpmv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpmv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -491,8 +491,8 @@ void F77_ctpsv(int *order, char *uplow, char *transp, char *diagn, cblas_ctpsv( CblasRowMajor, UNDEFINED, trans, diag, *n, ap, x, *incx ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); - AP=(CBLAS_TEST_COMPLEX*)malloc((((LDA+1)*LDA)/2)* + 
A=(CBLAS_TEST_COMPLEX*)malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX)); + AP=(CBLAS_TEST_COMPLEX*)malloc(((((size_t)LDA+1)*LDA)/2)* sizeof(CBLAS_TEST_COMPLEX)); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -544,7 +544,7 @@ void F77_ctrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA=*n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX)); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX)); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -573,7 +573,7 @@ void F77_ctrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A =(CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A =(CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { A[ LDA*i+j ].real=a[ (*lda)*j+i ].real; @@ -601,8 +601,8 @@ void F77_chpr(int *order, char *uplow, int *n, float *alpha, cblas_chpr(CblasRowMajor, UNDEFINED, *n, *alpha, x, *incx, ap ); else { LDA = *n; - A = (CBLAS_TEST_COMPLEX* )malloc(LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP = ( CBLAS_TEST_COMPLEX* )malloc( (((LDA+1)*LDA)/2)* + A = (CBLAS_TEST_COMPLEX* )malloc((size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP = ( CBLAS_TEST_COMPLEX* )malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -678,8 +678,8 @@ void F77_chpr2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, *incy, ap ); else { LDA = *n; - A=(CBLAS_TEST_COMPLEX*)malloc( LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); - AP=(CBLAS_TEST_COMPLEX*)malloc( (((LDA+1)*LDA)/2)* + A=(CBLAS_TEST_COMPLEX*)malloc( (size_t)LDA*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + AP=(CBLAS_TEST_COMPLEX*)malloc( ((((size_t)LDA+1)*LDA)/2)* sizeof( CBLAS_TEST_COMPLEX )); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) @@ -750,7 +750,7 @@ void F77_cher(int *order, char *uplow, int *n, float *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A=(CBLAS_TEST_COMPLEX*)malloc((*n)*LDA*sizeof( CBLAS_TEST_COMPLEX )); + A=(CBLAS_TEST_COMPLEX*)malloc((*n)*(size_t)LDA*sizeof( CBLAS_TEST_COMPLEX )); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { @@ -784,7 +784,7 @@ void F77_cher2(int *order, char *uplow, int *n, CBLAS_TEST_COMPLEX *alpha, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*LDA*sizeof(CBLAS_TEST_COMPLEX ) ); + A= ( CBLAS_TEST_COMPLEX* )malloc((*n)*(size_t)LDA*sizeof(CBLAS_TEST_COMPLEX ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) { diff --git a/ctest/c_dblas2.c b/ctest/c_dblas2.c index 423a58748..ae3854c0e 100644 --- a/ctest/c_dblas2.c +++ b/ctest/c_dblas2.c @@ -19,7 +19,7 @@ void F77_dgemv(int *order, char *transp, int *m, int *n, double *alpha, get_transpose_type(transp, &trans); if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -43,7 +43,7 @@ void F77_dger(int *order, int *m, int *n, double *alpha, double *x, int *incx, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*m)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*m)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*m; i++ ) { for( j=0; j<*n; j++ ) @@ -74,7 +74,7 @@ void F77_dtrmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( 
(*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -102,7 +102,7 @@ void F77_dtrsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -123,7 +123,7 @@ void F77_dsymv(int *order, char *uplow, int *n, double *alpha, double *a, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -146,7 +146,7 @@ void F77_dsyr(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -170,7 +170,7 @@ void F77_dsyr2(int *order, char *uplow, int *n, double *alpha, double *x, if (*order == TEST_ROW_MJR) { LDA = *n+1; - A = ( double* )malloc( (*n)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*n; i++ ) for( j=0; j<*n; j++ ) A[ LDA*i+j ]=a[ (*lda)*j+i ]; @@ -196,7 +196,7 @@ void F77_dgbmv(int *order, char *transp, int *m, int *n, int *kl, int *ku, if (*order == TEST_ROW_MJR) { LDA = *ku+*kl+2; - A = ( double* )malloc( (*n+*kl)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*kl)*(size_t)LDA*sizeof( double ) ); for( i=0; i<*ku; i++ ){ irow=*ku+*kl-i; jcol=(*ku)-i; @@ -236,7 +236,7 @@ void F77_dtbmv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -282,7 +282,7 @@ void F77_dtbsv(int *order, char *uplow, char *transp, char *diagn, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -325,7 +325,7 @@ void F77_dsbmv(int *order, char *uplow, int *n, int *k, double *alpha, if (*order == TEST_ROW_MJR) { LDA = *k+1; - A = ( double* )malloc( (*n+*k)*LDA*sizeof( double ) ); + A = ( double* )malloc( (*n+*k)*(size_t)LDA*sizeof( double ) ); if (uplo == CblasUpper) { for( i=0; i<*k; i++ ){ irow=*k-i; @@ -369,8 +369,8 @@ void F77_dspmv(int *order, char *uplow, int *n, double *alpha, double *ap, if (*order == TEST_ROW_MJR) { LDA = *n; - A = ( double* )malloc( LDA*LDA*sizeof( double ) ); - AP = ( double* )malloc( (((LDA+1)*LDA)/2)*sizeof( double ) ); + A = ( double* )malloc( (size_t)LDA*LDA*sizeof( double ) ); + AP = ( double* )malloc( ((((size_t)LDA+1)*LDA)/2)*sizeof( double ) ); if (uplo == CblasUpper) { for( j=0, k=0; j<*n; j++ ) for( i=0; i *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT); else - msize = (*ldb) * (*ldb) * sizeof(FLOAT); + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT); b = malloc(msize); if ( b == NULL ) diff --git a/interface/zimatcopy.c b/interface/zimatcopy.c index 87964e20d..ecda5ef4e 100644 --- a/interface/zimatcopy.c +++ b/interface/zimatcopy.c 
@@ -172,9 +172,9 @@ void CNAME( enum CBLAS_ORDER CORDER, enum CBLAS_TRANSPOSE CTRANS, blasint crows, #endif if ( *lda > *ldb ) - msize = (*lda) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*lda) * (*ldb) * sizeof(FLOAT) * 2; else - msize = (*ldb) * (*ldb) * sizeof(FLOAT) * 2; + msize = (size_t)(*ldb) * (*ldb) * sizeof(FLOAT) * 2; b = malloc(msize); if ( b == NULL ) diff --git a/interface/zrotg.c b/interface/zrotg.c index 8caa411fc..bc4f06492 100644 --- a/interface/zrotg.c +++ b/interface/zrotg.c @@ -79,8 +79,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ aa_i = fabs(da_r); } - scale = (aa_i / aa_r); - ada = aa_r * sqrt(ONE + scale * scale); + if (aa_r == ZERO) { + ada = 0.; + } else { + scale = (aa_i / aa_r); + ada = aa_r * sqrt(ONE + scale * scale); + } bb_r = fabs(db_r); bb_i = fabs(db_i); @@ -90,9 +94,12 @@ void NAME(FLOAT *DA, FLOAT *DB, FLOAT *C, FLOAT *S){ bb_i = fabs(bb_r); } - scale = (bb_i / bb_r); - adb = bb_r * sqrt(ONE + scale * scale); - + if (bb_r == ZERO) { + adb = 0.; + } else { + scale = (bb_i / bb_r); + adb = bb_r * sqrt(ONE + scale * scale); + } scale = ada + adb; aa_r = da_r / scale; diff --git a/kernel/arm64/sgemm_tcopy_16.S b/kernel/arm64/sgemm_tcopy_16.S index 12b80bdca..46198b3a2 100644 --- a/kernel/arm64/sgemm_tcopy_16.S +++ b/kernel/arm64/sgemm_tcopy_16.S @@ -270,11 +270,6 @@ All rights reserved. ldr s1, [A02] ldr s2, [A03] ldr s3, [A04] - - add A01, A01, #4 - add A02, A02, #4 - add A03, A03, #4 - add A04, A04, #4 stp s0, s1, [B04] add B04, B04, #8 @@ -285,11 +280,6 @@ All rights reserved. ldr s5, [A06] ldr s6, [A07] ldr s7, [A08] - - ldr d4, [A05], #8 - ldr d5, [A06], #8 - ldr d6, [A07], #8 - ldr d7, [A08], #8 stp s4, s5, [B04] add B04, B04, #8 diff --git a/kernel/power/KERNEL.POWER10 b/kernel/power/KERNEL.POWER10 index 1cf7b0b7c..873653f1e 100644 --- a/kernel/power/KERNEL.POWER10 +++ b/kernel/power/KERNEL.POWER10 @@ -169,8 +169,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c @@ -181,7 +186,7 @@ ZSWAPKERNEL = zswap.c SGEMVNKERNEL = sgemv_n.c DGEMVNKERNEL = dgemv_n_power10.c CGEMVNKERNEL = cgemv_n.c -ZGEMVNKERNEL = zgemv_n_4.c +ZGEMVNKERNEL = zgemv_n_power10.c # SGEMVTKERNEL = sgemv_t.c DGEMVTKERNEL = dgemv_t_power10.c diff --git a/kernel/power/KERNEL.POWER8 b/kernel/power/KERNEL.POWER8 index c2f4cd204..2b8e65948 100644 --- a/kernel/power/KERNEL.POWER8 +++ b/kernel/power/KERNEL.POWER8 @@ -242,8 +242,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/KERNEL.POWER9 b/kernel/power/KERNEL.POWER9 index 2bd2516de..b6b102b3e 100644 --- a/kernel/power/KERNEL.POWER9 +++ b/kernel/power/KERNEL.POWER9 @@ -166,8 +166,13 @@ ZROTKERNEL = zrot.c # SSCALKERNEL = sscal.c DSCALKERNEL = dscal.c +ifeq ($(C_COMPILER), PGI) +CSCALKERNEL = ../arm/zscal.c +ZSCALKERNEL = ../arm/zscal.c +else CSCALKERNEL = zscal.c ZSCALKERNEL = zscal.c +endif # SSWAPKERNEL = sswap.c DSWAPKERNEL = dswap.c diff --git a/kernel/power/cdot.c b/kernel/power/cdot.c index c53fe0c02..b9e2d2ce5 100644 --- a/kernel/power/cdot.c +++ b/kernel/power/cdot.c @@ -28,7 +28,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#else #include "common.h" -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cdot_microk_power10.c" #else #ifndef HAVE_KERNEL_8 @@ -120,7 +120,7 @@ OPENBLAS_COMPLEX_FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLA if ((inc_x == 1) && (inc_y == 1)) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) BLASLONG n1 = n & -16; #else BLASLONG n1 = n & -8; diff --git a/kernel/power/cswap.c b/kernel/power/cswap.c index 4d9b9ccd6..c2fde1c44 100644 --- a/kernel/power/cswap.c +++ b/kernel/power/cswap.c @@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "cswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "cswap_microk_power10.c" +#elif defined(POWER10) +#include "cswap_microk_power8.c" #endif #endif diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c index 0cdec3292..7507621cf 100644 --- a/kernel/power/dasum.c +++ b/kernel/power/dasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dasum_microk_power10.c" +#elif defined(POWER10) +#include "dasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dgemm_kernel_power10.c b/kernel/power/dgemm_kernel_power10.c index e918e61c3..cdd846891 100644 --- a/kernel/power/dgemm_kernel_power10.c +++ b/kernel/power/dgemm_kernel_power10.c @@ -190,10 +190,9 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, __vector_quad acc0, acc1, acc2, acc3, acc4,acc5,acc6,acc7; BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; - vec_t *rb = (vec_t *) & BO[0]; __vector_pair rowB, rowB1; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -205,9 +204,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -247,9 +245,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + 
rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[1]); @@ -257,9 +254,8 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[1]); @@ -291,17 +287,15 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB, rowB1; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[0])); + rowB1 = *((__vector_pair *)((void *)&BO[4])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB1, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 3]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); - __builtin_vsx_assemble_pair (&rowB1, rb[3], rb[2]); + rowB = *((__vector_pair *)((void *)&BO[l << 3])); + rowB1 = *((__vector_pair *)((void *)&BO[(l << 3) + 4])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB1, rowA[0]); } @@ -403,8 +397,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); __builtin_mma_xvf64ger (&acc2, rowB, rowA[2]); @@ -412,8 +405,7 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 3]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); __builtin_mma_xvf64gerpp (&acc2, rowB, rowA[2]); @@ -445,15 +437,13 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); __builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); __builtin_mma_xvf64ger (&acc1, rowB, rowA[1]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 2]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); __builtin_mma_xvf64gerpp (&acc1, rowB, rowA[1]); } @@ -481,14 +471,12 @@ CNAME (BLASLONG m, BLASLONG n, BLASLONG k, FLOAT alpha, FLOAT * A, FLOAT * B, BLASLONG l = 0; vec_t *rowA = (vec_t *) & AO[0]; __vector_pair rowB; - vec_t *rb = (vec_t *) & BO[0]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[0])); 
__builtin_mma_xvf64ger (&acc0, rowB, rowA[0]); for (l = 1; l < temp; l++) { rowA = (vec_t *) & AO[l << 1]; - rb = (vec_t *) & BO[l << 2]; - __builtin_vsx_assemble_pair (&rowB, rb[1], rb[0]); + rowB = *((__vector_pair *)((void *)&BO[l << 2])); __builtin_mma_xvf64gerpp (&acc0, rowB, rowA[0]); } SAVE_ACC (&acc0, 0); diff --git a/kernel/power/drot.c b/kernel/power/drot.c index 94d9d95a3..3229878e4 100644 --- a/kernel/power/drot.c +++ b/kernel/power/drot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "drot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "drot_microk_power10.c" +#elif defined(POWER10) +#include "drot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c index 96c4e51bc..32c39a8f4 100644 --- a/kernel/power/dscal.c +++ b/kernel/power/dscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "dscal_microk_power10.c" +#elif defined(POWER10) +#include "dscal_microk_power8.c" #endif #endif @@ -102,7 +104,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; @@ -136,7 +138,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/dswap.c b/kernel/power/dswap.c index 9e6229c6a..12476965b 100644 --- a/kernel/power/dswap.c +++ b/kernel/power/dswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "dswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "dswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 3) & 0x3; diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c index af692a7fa..991d27508 100644 --- a/kernel/power/sasum.c +++ b/kernel/power/sasum.c @@ -49,8 +49,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sasum_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sasum_microk_power10.c" +#elif defined(POWER10) +#include "sasum_microk_power8.c" #endif #endif @@ -112,7 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) if ( inc_x == 1 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/srot.c b/kernel/power/srot.c index 3e4f93e2a..5a0d4b12e 100644 --- a/kernel/power/srot.c +++ b/kernel/power/srot.c @@ -42,8 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "srot_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "srot_microk_power10.c" +#elif defined(POWER10) +#include "srot_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT if ( (inc_x == 1) && (inc_y == 1) ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 16 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c index 65572a8c1..9ae9ccab8 100644 --- a/kernel/power/sscal.c +++ b/kernel/power/sscal.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sscal_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "sscal_microk_power10.c" +#elif defined(POWER10) +#include "sscal_microk_power8.c" #endif #endif @@ -104,7 +106,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS if ( da == 0.0 ) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; @@ -138,7 +140,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS else { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 32 ) { BLASLONG align = ((32 - ((uintptr_t)x & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/sswap.c b/kernel/power/sswap.c index dd249fd36..955ed02f0 100644 --- a/kernel/power/sswap.c +++ b/kernel/power/sswap.c @@ -38,8 +38,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
#if defined(__VEC__) || defined(__ALTIVEC__) #if defined(POWER8) || defined(POWER9) #include "sswap_microk_power8.c" -#elif defined(POWER10) +#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) #include "swap_microk_power10.c" +#elif defined(POWER10) +#include "sswap_microk_power8.c" #endif #endif @@ -117,7 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT dummy3, FLOAT *x, if ( (inc_x == 1) && (inc_y == 1 )) { -#if defined(POWER10) +#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__) if ( n >= 64 ) { BLASLONG align = ((32 - ((uintptr_t)y & (uintptr_t)0x1F)) >> 2) & 0x7; diff --git a/kernel/power/zgemv_n_power10.c b/kernel/power/zgemv_n_power10.c new file mode 100644 index 000000000..f5bb8d70e --- /dev/null +++ b/kernel/power/zgemv_n_power10.c @@ -0,0 +1,1102 @@ +/*************************************************************************** +Copyright (c) 2018, The OpenBLAS Project +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are +met: +1. Redistributions of source code must retain the above copyright +notice, this list of conditions and the following disclaimer. +2. Redistributions in binary form must reproduce the above copyright +notice, this list of conditions and the following disclaimer in +the documentation and/or other materials provided with the +distribution. +3. Neither the name of the OpenBLAS project nor the names of +its contributors may be used to endorse or promote products +derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE +LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE +USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + +#include +#include +#include "common.h" + +#if defined(__VEC__) || defined(__ALTIVEC__) + +#define HAVE_KERNEL_4x4_VEC 1 +#define HAVE_KERNEL_4x2_VEC 1 +#define HAVE_KERNEL_4x1_VEC 1 +#define HAVE_KERNEL_ADDY 1 + +#if defined(HAVE_KERNEL_4x4_VEC) || defined(HAVE_KERNEL_4x2_VEC) || defined(HAVE_KERNEL_4x1_VEC) +#include +#endif +#endif + +// +#define NBMAX 4096 + +#ifdef HAVE_KERNEL_4x4_VEC_ASM + +#elif HAVE_KERNEL_4x4_VEC +typedef __vector unsigned char vec_t; +typedef FLOAT v4sf_t __attribute__ ((vector_size (16))); + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] - result[1][1]; \ + result[0][1] = result[0][1] + result[1][0]; \ + result[1][0] = result[2][0] - result[3][1]; \ + result[1][1] = result[2][1] + result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#else +#define SAVE_RESULT(ACC, J) \ + __builtin_mma_disassemble_acc ((void *)result, ACC); \ + result[0][0] = result[0][0] + result[1][1]; \ + result[0][1] = result[0][1] - result[1][0]; \ + result[1][0] = result[2][0] + result[3][1]; \ + result[1][1] = result[2][1] - result[3][0]; \ + rowC = (v4sf_t *) &y[i2 + J]; \ + rowC[0] += result[0]; \ + rowC[1] += result[1]; +#endif + +static void zgemv_kernel_4x8(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3, *a4, *a5, *a6, *a7; + __vector_quad acc0, acc1, acc2, acc3; + v4sf_t result[4]; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + a4 = a3 + lda; + a5 = a4 + lda; + a6 = a5 + lda; + a7 = a6 + lda; + + register __vector double vx0_r = {x[0], x[1]}; + register __vector double vx1_r = {x[2], x[3]}; + register __vector double vx2_r = {x[4], x[5]}; + register __vector double vx3_r = {x[6], x[7]}; + register __vector double vx4_r = {x[8], x[9]}; + register __vector double vx5_r = {x[10], x[11]}; + register __vector double vx6_r = {x[12], x[13]}; + register __vector double vx7_r = {x[14], x[15]}; + __vector_pair *Va0, *Va1, *Va2, *Va3; + __vector_pair *Va4, *Va5, *Va6, *Va7; + BLASLONG i = 0, i2 = 0; + v4sf_t *rowC; + BLASLONG tmp = (n / 8) * 8; + for (i = 0; i < tmp; i += 8) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + 
__builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + __builtin_mma_xvf64ger (&acc2, Va0[2], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc3, Va0[3], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc2, Va1[2], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc3, Va1[3], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc2, Va2[2], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc3, Va2[3], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc2, Va3[2], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc3, Va3[3], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc2, Va4[2], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc3, Va4[3], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc2, Va5[2], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc3, Va5[3], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc2, Va6[2], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc3, Va6[3], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc2, Va7[2], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc3, Va7[3], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + SAVE_RESULT(&acc2, 8); + SAVE_RESULT(&acc3, 12); + } + while (i < n) { + i2 = i*2; + Va0 = ((__vector_pair*)((void*)&a0[i2])); + Va1 = ((__vector_pair*)((void*)&a1[i2])); + Va2 = ((__vector_pair*)((void*)&a2[i2])); + Va3 = ((__vector_pair*)((void*)&a3[i2])); + Va4 = ((__vector_pair*)((void*)&a4[i2])); + Va5 = ((__vector_pair*)((void*)&a5[i2])); + Va6 = ((__vector_pair*)((void*)&a6[i2])); + Va7 = ((__vector_pair*)((void*)&a7[i2])); + + __builtin_mma_xvf64ger (&acc0, Va0[0], (vec_t ) vx0_r); + __builtin_mma_xvf64ger (&acc1, Va0[1], (vec_t ) vx0_r); + __builtin_mma_xvf64gerpp (&acc0, Va1[0], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc1, Va1[1], (vec_t ) vx1_r); + __builtin_mma_xvf64gerpp (&acc0, Va2[0], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc1, Va2[1], (vec_t ) vx2_r); + __builtin_mma_xvf64gerpp (&acc0, Va3[0], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc1, Va3[1], (vec_t ) vx3_r); + __builtin_mma_xvf64gerpp (&acc0, Va4[0], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc1, Va4[1], (vec_t ) vx4_r); + __builtin_mma_xvf64gerpp (&acc0, Va5[0], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc1, Va5[1], (vec_t ) vx5_r); + __builtin_mma_xvf64gerpp (&acc0, Va6[0], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc1, Va6[1], (vec_t ) vx6_r); + __builtin_mma_xvf64gerpp (&acc0, Va7[0], (vec_t ) vx7_r); + __builtin_mma_xvf64gerpp (&acc1, Va7[1], (vec_t ) vx7_r); + SAVE_RESULT(&acc0, 0); + SAVE_RESULT(&acc1, 4); + i += 4; + } +} +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + + register __vector double vx0_r = {x[0], x[0]}; + register __vector double vx0_i = {-x[1], x[1]}; + register __vector double vx1_r = {x[2], x[2]}; + register __vector double vx1_i = {-x[3], x[3]}; + register __vector double vx2_r = {x[4], x[4]}; + register __vector double vx2_i = {-x[5], x[5]}; + register __vector double vx3_r = {x[6], x[6]}; + register __vector double vx3_i = {-x[7], x[7]}; + +#else + register __vector double vx0_r = {x[0], -x[0]}; + register __vector double vx0_i = {x[1], x[1]}; + register __vector double vx1_r = {x[2], -x[2]}; + register __vector double vx1_i = {x[3], x[3]}; + register __vector double vx2_r = {x[4], -x[4]}; + register __vector double vx2_i = {x[5], x[5]}; 
+ register __vector double vx3_r = {x[6], -x[6]}; + register __vector double vx3_i = {x[7], x[7]}; +#endif + + register __vector double *vy = (__vector double *) y; + register __vector double *vptr_a0 = (__vector double *) a0; + register __vector double *vptr_a1 = (__vector double *) a1; + register __vector double *vptr_a2 = (__vector double *) a2; + register __vector double *vptr_a3 = (__vector double *) a3; + + + register __vector double vy_0; + register __vector double va0; + register __vector double va1; + register __vector double va2; + register __vector double va3; + register __vector double vy_1; + register __vector double va0_1; + register __vector double va1_1; + register __vector double va2_1; + register __vector double va3_1; + register __vector double vy_2; + register __vector double va0_2; + register __vector double va1_2; + register __vector double va2_2; + register __vector double va3_2; + register __vector double vy_3; + register __vector double va0_3; + register __vector double va1_3; + register __vector double va2_3; + register __vector double va3_3; + + BLASLONG i = 0; + while (i < n) { + + vy_0 = vy[i]; + va0 = vptr_a0[i]; + va1 = vptr_a1[i]; + va2 = vptr_a2[i]; + va3 = vptr_a3[i]; + + vy_1 = vy[i + 1]; + va0_1 = vptr_a0[i + 1]; + va1_1 = vptr_a1[i + 1]; + va2_1 = vptr_a2[i + 1]; + va3_1 = vptr_a3[i + 1]; + + vy_2 = vy[i + 2]; + va0_2 = vptr_a0[i + 2]; + va1_2 = vptr_a1[i + 2]; + va2_2 = vptr_a2[i + 2]; + va3_2 = vptr_a3[i + 2]; + + vy_3 = vy[i + 3]; + va0_3 = vptr_a0[i + 3]; + va1_3 = vptr_a1[i + 3]; + va2_3 = vptr_a2[i + 3]; + va3_3 = vptr_a3[i + 3]; + + vy_0 += va0*vx0_r; + vy_1 += va0_1*vx0_r; + vy_2 += va0_2*vx0_r; + vy_3 += va0_3*vx0_r; + + + vy_0 += va1*vx1_r; + vy_1 += va1_1*vx1_r; + vy_2 += va1_2*vx1_r; + vy_3 += va1_3*vx1_r; + + va0 = vec_xxpermdi(va0, va0, 2); + va0_1 = vec_xxpermdi(va0_1, va0_1, 2); + + + vy_0 += va2*vx2_r; + vy_1 += va2_1*vx2_r; + va0_2 = vec_xxpermdi(va0_2, va0_2, 2); + va0_3 = vec_xxpermdi(va0_3, va0_3, 2); + vy_2 += va2_2*vx2_r; + vy_3 += va2_3*vx2_r; + + va1 = vec_xxpermdi(va1, va1, 2); + va1_1 = vec_xxpermdi(va1_1, va1_1, 2); + + + vy_0 += va3*vx3_r; + vy_1 += va3_1*vx3_r; + + va1_2 = vec_xxpermdi(va1_2, va1_2, 2); + va1_3 = vec_xxpermdi(va1_3, va1_3, 2); + + vy_2 += va3_2*vx3_r; + vy_3 += va3_3*vx3_r; + + va2 = vec_xxpermdi(va2, va2, 2); + va2_1 = vec_xxpermdi(va2_1, va2_1, 2); + + + vy_0 += va0*vx0_i; + vy_1 += va0_1*vx0_i; + + va2_2 = vec_xxpermdi(va2_2, va2_2, 2); + va2_3 = vec_xxpermdi(va2_3, va2_3, 2); + + vy_2 += va0_2*vx0_i; + vy_3 += va0_3*vx0_i; + + va3 = vec_xxpermdi(va3, va3, 2); + va3_1 = vec_xxpermdi(va3_1, va3_1, 2); + + + vy_0 += va1*vx1_i; + vy_1 += va1_1*vx1_i; + + va3_2 = vec_xxpermdi(va3_2, va3_2, 2); + va3_3 = vec_xxpermdi(va3_3, va3_3, 2); + + vy_2 += va1_2*vx1_i; + vy_3 += va1_3*vx1_i; + + vy_0 += va2*vx2_i; + vy_1 += va2_1*vx2_i; + vy_2 += va2_2*vx2_i; + vy_3 += va2_3*vx2_i; + + vy_0 += va3*vx3_i; + vy_1 += va3_1*vx3_i; + vy_2 += va3_2*vx3_i; + vy_3 += va3_3*vx3_i; + + vy[i] = vy_0; + vy[i + 1] = vy_1; + vy[i + 2] = vy_2; + vy[i + 3] = vy_3; + + + i += 4; + + + } + +} +#else + +static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) { + BLASLONG i; + FLOAT *a0, *a1, *a2, *a3; + a0 = ap; + a1 = ap + lda; + a2 = a1 + lda; + a3 = a2 + lda; + + for (i = 0; i < 2 * n; i += 2) { +#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) ) + y[i] += a0[i] * x[0] - a0[i + 1] * x[1]; + y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0]; + y[i] += a1[i] * x[2] - a1[i + 1] * 
+        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
+        y[i] += a2[i] * x[4] - a2[i + 1] * x[5];
+        y[i + 1] += a2[i] * x[5] + a2[i + 1] * x[4];
+        y[i] += a3[i] * x[6] - a3[i + 1] * x[7];
+        y[i + 1] += a3[i] * x[7] + a3[i + 1] * x[6];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
+        y[i] += a2[i] * x[4] + a2[i + 1] * x[5];
+        y[i + 1] += a2[i] * x[5] - a2[i + 1] * x[4];
+        y[i] += a3[i] * x[6] + a3[i + 1] * x[7];
+        y[i + 1] += a3[i] * x[7] - a3[i + 1] * x[6];
+#endif
+    }
+}
+
+#endif
+
+#ifdef HAVE_KERNEL_4x2_VEC
+
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+    register __vector double vx0_r = {x[0], x[0]};
+    register __vector double vx0_i = {-x[1], x[1]};
+    register __vector double vx1_r = {x[2], x[2]};
+    register __vector double vx1_i = {-x[3], x[3]};
+
+#else
+    register __vector double vx0_r = {x[0], -x[0]};
+    register __vector double vx0_i = {x[1], x[1]};
+    register __vector double vx1_r = {x[2], -x[2]};
+    register __vector double vx1_i = {x[3], x[3]};
+#endif
+
+    register __vector double *vy = (__vector double *) y;
+    register __vector double *vptr_a0 = (__vector double *) a0;
+    register __vector double *vptr_a1 = (__vector double *) a1;
+
+    for (i = 0; i < n; i += 4) {
+
+        register __vector double vy_0 = vy[i];
+        register __vector double vy_1 = vy[i + 1];
+        register __vector double vy_2 = vy[i + 2];
+        register __vector double vy_3 = vy[i + 3];
+
+        register __vector double va0 = vptr_a0[i];
+        register __vector double va0_1 = vptr_a0[i + 1];
+        register __vector double va0_2 = vptr_a0[i + 2];
+        register __vector double va0_3 = vptr_a0[i + 3];
+
+        register __vector double va1 = vptr_a1[i];
+        register __vector double va1_1 = vptr_a1[i + 1];
+        register __vector double va1_2 = vptr_a1[i + 2];
+        register __vector double va1_3 = vptr_a1[i + 3];
+
+        vy_0 += va0*vx0_r;
+        vy_1 += va0_1*vx0_r;
+        vy_2 += va0_2*vx0_r;
+        vy_3 += va0_3*vx0_r;
+
+        va0 = vec_xxpermdi(va0, va0, 2);
+        va0_1 = vec_xxpermdi(va0_1, va0_1, 2);
+        va0_2 = vec_xxpermdi(va0_2, va0_2, 2);
+        va0_3 = vec_xxpermdi(va0_3, va0_3, 2);
+
+        vy_0 += va1*vx1_r;
+        vy_1 += va1_1*vx1_r;
+        vy_2 += va1_2*vx1_r;
+        vy_3 += va1_3*vx1_r;
+
+        va1 = vec_xxpermdi(va1, va1, 2);
+        va1_1 = vec_xxpermdi(va1_1, va1_1, 2);
+        va1_2 = vec_xxpermdi(va1_2, va1_2, 2);
+        va1_3 = vec_xxpermdi(va1_3, va1_3, 2);
+
+        vy_0 += va0*vx0_i;
+        vy_1 += va0_1*vx0_i;
+        vy_2 += va0_2*vx0_i;
+        vy_3 += va0_3*vx0_i;
+
+        vy_0 += va1*vx1_i;
+        vy_1 += va1_1*vx1_i;
+        vy_2 += va1_2*vx1_i;
+        vy_3 += va1_3*vx1_i;
+
+        vy[i] = vy_0;
+        vy[i + 1] = vy_1;
+        vy[i + 2] = vy_2;
+        vy[i + 3] = vy_3;
+
+    }
+}
+#else
+
+static void zgemv_kernel_4x2(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0, *a1;
+    a0 = ap;
+    a1 = ap + lda;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] - a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] + a1[i + 1] * x[2];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+        y[i] += a1[i] * x[2] + a1[i + 1] * x[3];
+        y[i + 1] += a1[i] * x[3] - a1[i + 1] * x[2];
+#endif
+    }
+}
+
+#endif
+
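[Editorial note: the vectorized kernels above all rely on the same complex-arithmetic trick: the real part of each x element is splatted into vx*_r = {r, r}, the imaginary part goes into vx*_i with a sign flip in one lane, and vec_xxpermdi(va, va, 2) swaps the real/imaginary halves of the loaded column so that two plain vector FMAs produce a full complex multiply-accumulate. A minimal scalar sketch of the identity being exploited, illustrative only and not part of the patch:]

    /* y += a*x for complex doubles stored as {re, im}:
     *   {yr, yi} += {ar, ai} * { xr, xr}     (va        * vx_r)
     *            +  {ai, ar} * {-xi, xi}     (swap(va)  * vx_i)
     *            =  {ar*xr - ai*xi, ai*xr + ar*xi}, the usual complex product.
     */
    static void caxpy_1(const double a[2], const double x[2], double y[2]) {
        y[0] += a[0] * x[0] - a[1] * x[1];
        y[1] += a[1] * x[0] + a[0] * x[1];
    }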
+#ifdef HAVE_KERNEL_4x1_VEC
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+    register __vector double vx0_r = {x[0], x[0]};
+    register __vector double vx0_i = {-x[1], x[1]};
+
+#else
+    register __vector double vx0_r = {x[0], -x[0]};
+    register __vector double vx0_i = {x[1], x[1]};
+#endif
+
+    register __vector double *vy = (__vector double *) y;
+    register __vector double *vptr_a0 = (__vector double *) a0;
+
+    for (i = 0; i < n; i += 4) {
+
+        register __vector double vy_0 = vy[i];
+        register __vector double vy_1 = vy[i + 1];
+        register __vector double vy_2 = vy[i + 2];
+        register __vector double vy_3 = vy[i + 3];
+
+        register __vector double va0 = vptr_a0[i];
+        register __vector double va0_1 = vptr_a0[i + 1];
+        register __vector double va0_2 = vptr_a0[i + 2];
+        register __vector double va0_3 = vptr_a0[i + 3];
+
+        register __vector double va0x = vec_xxpermdi(va0, va0, 2);
+        register __vector double va0x_1 = vec_xxpermdi(va0_1, va0_1, 2);
+        register __vector double va0x_2 = vec_xxpermdi(va0_2, va0_2, 2);
+        register __vector double va0x_3 = vec_xxpermdi(va0_3, va0_3, 2);
+        vy_0 += va0*vx0_r + va0x*vx0_i;
+        vy_1 += va0_1*vx0_r + va0x_1*vx0_i;
+        vy_2 += va0_2*vx0_r + va0x_2*vx0_i;
+        vy_3 += va0_3*vx0_r + va0x_3*vx0_i;
+
+        vy[i] = vy_0;
+        vy[i + 1] = vy_1;
+        vy[i + 2] = vy_2;
+        vy[i + 3] = vy_3;
+
+    }
+}
+
+#else
+
+static void zgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) {
+    BLASLONG i;
+    FLOAT *a0;
+    a0 = ap;
+
+    for (i = 0; i < 2 * n; i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+        y[i] += a0[i] * x[0] - a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] + a0[i + 1] * x[0];
+#else
+        y[i] += a0[i] * x[0] + a0[i + 1] * x[1];
+        y[i + 1] += a0[i] * x[1] - a0[i + 1] * x[0];
+#endif
+    }
+}
+
+#endif
+
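[Editorial note: the add_y helper that follows applies the deferred alpha scaling: the kernels above accumulate A*x into a contiguous ybuffer with an implicit alpha of 1, and add_y then performs y += alpha * ybuffer elementwise in complex arithmetic, for both unit and strided destinations. A scalar sketch of one element, assuming XCONJ is not defined (hypothetical helper, matching the scalar fallback shown after this section):]

    /* dest += alpha * src for one complex element in {re, im} layout */
    static void add_y_1(const double alpha[2], const double src[2], double dest[2]) {
        dest[0] += alpha[0] * src[0] - alpha[1] * src[1];
        dest[1] += alpha[0] * src[1] + alpha[1] * src[0];
    }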
+#ifdef HAVE_KERNEL_ADDY
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+
+#if !defined(XCONJ)
+
+    register __vector double valpha_r = {alpha_r, alpha_r};
+    register __vector double valpha_i = {-alpha_i, alpha_i};
+
+#else
+    register __vector double valpha_r = {alpha_r, -alpha_r};
+    register __vector double valpha_i = {alpha_i, alpha_i};
+#endif
+
+    register __vector double *vptr_src = (__vector double *) src;
+    if (inc_dest != 2) {
+        register __vector double *vptr_y = (__vector double *) dest;
+        // inc_dest is already doubled (it counts FLOATs, not complex elements),
+        // so it can be added to a double* directly
+        register __vector double *vptr_y1 = (__vector double *) (dest + inc_dest);
+        register __vector double *vptr_y2 = (__vector double *) (dest + 2 * inc_dest);
+        register __vector double *vptr_y3 = (__vector double *) (dest + 3 * inc_dest);
+        BLASLONG dest_t = 0;
+        BLASLONG add_dest = inc_dest << 1; // inc_dest is already doubled; shift once more to step four complex elements per iteration
+        for (i = 0; i < n; i += 4) {
+
+            register __vector double vy_0 = vptr_y[dest_t];
+            register __vector double vy_1 = vptr_y1[dest_t];
+            register __vector double vy_2 = vptr_y2[dest_t];
+            register __vector double vy_3 = vptr_y3[dest_t];
+
+            register __vector double vsrc = vptr_src[i];
+            register __vector double vsrc_1 = vptr_src[i + 1];
+            register __vector double vsrc_2 = vptr_src[i + 2];
+            register __vector double vsrc_3 = vptr_src[i + 3];
+
+            vy_0 += vsrc*valpha_r;
+            vy_1 += vsrc_1*valpha_r;
+            vy_2 += vsrc_2*valpha_r;
+            vy_3 += vsrc_3*valpha_r;
+
+            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
+            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
+            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
+            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);
+
+            vy_0 += vsrc*valpha_i;
+            vy_1 += vsrc_1*valpha_i;
+            vy_2 += vsrc_2*valpha_i;
+            vy_3 += vsrc_3*valpha_i;
+
+            vptr_y[dest_t] = vy_0;
+            vptr_y1[dest_t] = vy_1;
+            vptr_y2[dest_t] = vy_2;
+            vptr_y3[dest_t] = vy_3;
+
+            dest_t += add_dest;
+
+        }
+
+        return;
+    } else {
+        register __vector double *vptr_y = (__vector double *) dest;
+        for (i = 0; i < n; i += 4) {
+
+            register __vector double vy_0 = vptr_y[i];
+            register __vector double vy_1 = vptr_y[i + 1];
+            register __vector double vy_2 = vptr_y[i + 2];
+            register __vector double vy_3 = vptr_y[i + 3];
+
+            register __vector double vsrc = vptr_src[i];
+            register __vector double vsrc_1 = vptr_src[i + 1];
+            register __vector double vsrc_2 = vptr_src[i + 2];
+            register __vector double vsrc_3 = vptr_src[i + 3];
+
+            vy_0 += vsrc*valpha_r;
+            vy_1 += vsrc_1*valpha_r;
+            vy_2 += vsrc_2*valpha_r;
+            vy_3 += vsrc_3*valpha_r;
+
+            vsrc = vec_xxpermdi(vsrc, vsrc, 2);
+            vsrc_1 = vec_xxpermdi(vsrc_1, vsrc_1, 2);
+            vsrc_2 = vec_xxpermdi(vsrc_2, vsrc_2, 2);
+            vsrc_3 = vec_xxpermdi(vsrc_3, vsrc_3, 2);
+
+            vy_0 += vsrc*valpha_i;
+            vy_1 += vsrc_1*valpha_i;
+            vy_2 += vsrc_2*valpha_i;
+            vy_3 += vsrc_3*valpha_i;
+
+            vptr_y[i] = vy_0;
+            vptr_y[i + 1] = vy_1;
+            vptr_y[i + 2] = vy_2;
+            vptr_y[i + 3] = vy_3;
+
+        }
+
+        return;
+    }
+}
+
+#else
+
+static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+
+    if (inc_dest != 2) {
+
+        FLOAT temp_r;
+        FLOAT temp_i;
+        for (i = 0; i < n; i++) {
+#if !defined(XCONJ)
+            temp_r = alpha_r * src[0] - alpha_i * src[1];
+            temp_i = alpha_r * src[1] + alpha_i * src[0];
+#else
+            temp_r = alpha_r * src[0] + alpha_i * src[1];
+            temp_i = -alpha_r * src[1] + alpha_i * src[0];
+#endif
+
+            *dest += temp_r;
+            *(dest + 1) += temp_i;
+
+            src += 2;
+            dest += inc_dest;
+        }
+        return;
+    }
+
+    FLOAT temp_r0;
+    FLOAT temp_i0;
+    FLOAT temp_r1;
+    FLOAT temp_i1;
+    FLOAT temp_r2;
+    FLOAT temp_i2;
+    FLOAT temp_r3;
+    FLOAT temp_i3;
+    for (i = 0; i < n; i += 4) {
+#if !defined(XCONJ)
+        temp_r0 = alpha_r * src[0] - alpha_i * src[1];
+        temp_i0 = alpha_r * src[1] + alpha_i * src[0];
+        temp_r1 = alpha_r * src[2] - alpha_i * src[3];
+        temp_i1 = alpha_r * src[3] + alpha_i * src[2];
+        temp_r2 = alpha_r * src[4] - alpha_i * src[5];
+        temp_i2 = alpha_r * src[5] + alpha_i * src[4];
+        temp_r3 = alpha_r * src[6] - alpha_i * src[7];
+        temp_i3 = alpha_r * src[7] + alpha_i * src[6];
+#else
+        temp_r0 = alpha_r * src[0] + alpha_i * src[1];
+        temp_i0 = -alpha_r * src[1] + alpha_i * src[0];
+        temp_r1 = alpha_r * src[2] + alpha_i * src[3];
+        temp_i1 = -alpha_r * src[3] + alpha_i * src[2];
+        temp_r2 = alpha_r * src[4] + alpha_i * src[5];
+        temp_i2 = -alpha_r * src[5] + alpha_i * src[4];
+        temp_r3 = alpha_r * src[6] + alpha_i * src[7];
+        temp_i3 = -alpha_r * src[7] + alpha_i * src[6];
+#endif
+
+        dest[0] += temp_r0;
+        dest[1] += temp_i0;
+        dest[2] += temp_r1;
+        dest[3] += temp_i1;
+        dest[4] += temp_r2;
+        dest[5] += temp_i2;
+        dest[6] += temp_r3;
+        dest[7] += temp_i3;
+
+        src += 8;
+        dest += 8;
+    }
+    return;
+}
+#endif
+
+int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha_r, FLOAT alpha_i, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT * buffer) {
+    BLASLONG i;
+    BLASLONG j;
+    FLOAT *a_ptr;
+    FLOAT *x_ptr;
+    FLOAT *y_ptr;
+
+    BLASLONG n1;
+    BLASLONG m1;
+    BLASLONG m2;
+    BLASLONG m3;
+    BLASLONG n2;
+    FLOAT xbuffer[16] __attribute__((aligned(16)));
+    FLOAT *ybuffer;
+
+    if (m < 1) return (0);
+    if (n < 1) return (0);
+
+    ybuffer = buffer;
+
+    inc_x *= 2;
+    inc_y *= 2;
+    lda *= 2;
+
+    n1 = n / 8;
+    n2 = n % 8;
+
+    m3 = m % 4;
+    m1 = m - (m % 4);
+    m2 = (m % NBMAX) - (m % 4);
+
+    y_ptr = y;
+
+    BLASLONG NB = NBMAX;
+
+    while (NB == NBMAX) {
+
+        m1 -= NB;
+        if (m1 < 0) {
+            if (m2 == 0) break;
+            NB = m2;
+        }
+
+        a_ptr = a;
+
+        x_ptr = x;
+        //zero_y(NB,ybuffer);
+        memset(ybuffer, 0, NB * 16);
+
+        if (inc_x == 2) {
+
+            for (i = 0; i < n1; i++) {
+                zgemv_kernel_4x8(NB, lda, a_ptr, x_ptr, ybuffer);
+
+                a_ptr += lda << 3;
+                x_ptr += 16;
+            }
+            if (n2 & 4) {
+                zgemv_kernel_4x4(NB, lda, a_ptr, x_ptr, ybuffer);
+
+                a_ptr += lda << 2;
+                x_ptr += 8;
+            }
+
+            if (n2 & 2) {
+                zgemv_kernel_4x2(NB, lda, a_ptr, x_ptr, ybuffer);
+                x_ptr += 4;
+                a_ptr += 2 * lda;
+
+            }
+
+            if (n2 & 1) {
+                zgemv_kernel_4x1(NB, a_ptr, x_ptr, ybuffer);
+                x_ptr += 2;
+                a_ptr += lda;
+
+            }
+        } else {
+
+            for (i = 0; i < n1; i++) {
+
+                xbuffer[0] = x_ptr[0];
+                xbuffer[1] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[2] = x_ptr[0];
+                xbuffer[3] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[4] = x_ptr[0];
+                xbuffer[5] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[6] = x_ptr[0];
+                xbuffer[7] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[8] = x_ptr[0];
+                xbuffer[9] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[10] = x_ptr[0];
+                xbuffer[11] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[12] = x_ptr[0];
+                xbuffer[13] = x_ptr[1];
+                x_ptr += inc_x;
+                xbuffer[14] = x_ptr[0];
+                xbuffer[15] = x_ptr[1];
+                x_ptr += inc_x;
+                zgemv_kernel_4x8(NB, lda, a_ptr, xbuffer, ybuffer);
+
+                a_ptr += lda << 3;
+            }
+            for (i = 0; i < n2; i++) {
+                xbuffer[0] = x_ptr[0];
+                xbuffer[1] = x_ptr[1];
+                x_ptr += inc_x;
+                zgemv_kernel_4x1(NB, a_ptr, xbuffer, ybuffer);
+                a_ptr += lda;
+
+            }
+
+        }
+
+        add_y(NB, ybuffer, y_ptr, inc_y, alpha_r, alpha_i);
+        a += 2 * NB;
+        y_ptr += NB * inc_y;
+    }
+
+    if (m3 == 0) return (0);
+
+    if (m3 == 1) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r = 0.0;
+        FLOAT temp_i = 0.0;
+
+        if (lda == 2 && inc_x == 2) {
+
+            for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r += a_ptr[2] * x_ptr[2] - a_ptr[3] * x_ptr[3];
+                temp_i += a_ptr[2] * x_ptr[3] + a_ptr[3] * x_ptr[2];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r += a_ptr[2] * x_ptr[2] + a_ptr[3] * x_ptr[3];
+                temp_i += a_ptr[2] * x_ptr[3] - a_ptr[3] * x_ptr[2];
+#endif
+
+                a_ptr += 4;
+                x_ptr += 4;
+            }
+
+            for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+                a_ptr += 2;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+#else
+                temp_r += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r - alpha_i * temp_i;
+        y_ptr[1] += alpha_r * temp_i + alpha_i * temp_r;
+#else
+        y_ptr[0] += alpha_r * temp_r + alpha_i * temp_i;
+        y_ptr[1] -= alpha_r * temp_i - alpha_i * temp_r;
+#endif
+        return (0);
+    }
+
+    if (m3 == 2) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r0 = 0.0;
+        FLOAT temp_i0 = 0.0;
+        FLOAT temp_r1 = 0.0;
+        FLOAT temp_i1 = 0.0;
+
+        if (lda == 4 && inc_x == 2) {
+
+            for (i = 0; i < (n & -2); i += 2) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+
+                temp_r0 += a_ptr[4] * x_ptr[2] - a_ptr[5] * x_ptr[3];
+                temp_i0 += a_ptr[4] * x_ptr[3] + a_ptr[5] * x_ptr[2];
+                temp_r1 += a_ptr[6] * x_ptr[2] - a_ptr[7] * x_ptr[3];
+                temp_i1 += a_ptr[6] * x_ptr[3] + a_ptr[7] * x_ptr[2];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+
+                temp_r0 += a_ptr[4] * x_ptr[2] + a_ptr[5] * x_ptr[3];
+                temp_i0 += a_ptr[4] * x_ptr[3] - a_ptr[5] * x_ptr[2];
+                temp_r1 += a_ptr[6] * x_ptr[2] + a_ptr[7] * x_ptr[3];
+                temp_i1 += a_ptr[6] * x_ptr[3] - a_ptr[7] * x_ptr[2];
+#endif
+
+                a_ptr += 8;
+                x_ptr += 4;
+            }
+
+            for (; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+                a_ptr += 4;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+#else
+        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+#endif
+        return (0);
+    }
+
+    if (m3 == 3) {
+        a_ptr = a;
+        x_ptr = x;
+        FLOAT temp_r0 = 0.0;
+        FLOAT temp_i0 = 0.0;
+        FLOAT temp_r1 = 0.0;
+        FLOAT temp_i1 = 0.0;
+        FLOAT temp_r2 = 0.0;
+        FLOAT temp_i2 = 0.0;
+
+        if (lda == 6 && inc_x == 2) {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+                a_ptr += 6;
+                x_ptr += 2;
+            }
+
+        } else {
+
+            for (i = 0; i < n; i++) {
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+                temp_r0 += a_ptr[0] * x_ptr[0] - a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] + a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] - a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] + a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] - a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] + a_ptr[5] * x_ptr[0];
+#else
+                temp_r0 += a_ptr[0] * x_ptr[0] + a_ptr[1] * x_ptr[1];
+                temp_i0 += a_ptr[0] * x_ptr[1] - a_ptr[1] * x_ptr[0];
+                temp_r1 += a_ptr[2] * x_ptr[0] + a_ptr[3] * x_ptr[1];
+                temp_i1 += a_ptr[2] * x_ptr[1] - a_ptr[3] * x_ptr[0];
+                temp_r2 += a_ptr[4] * x_ptr[0] + a_ptr[5] * x_ptr[1];
+                temp_i2 += a_ptr[4] * x_ptr[1] - a_ptr[5] * x_ptr[0];
+#endif
+
+                a_ptr += lda;
+                x_ptr += inc_x;
+            }
+
+        }
+#if !defined(XCONJ)
+        y_ptr[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+        y_ptr[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 - alpha_i * temp_i1;
+        y_ptr[1] += alpha_r * temp_i1 + alpha_i * temp_r1;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r2 - alpha_i * temp_i2;
+        y_ptr[1] += alpha_r * temp_i2 + alpha_i * temp_r2;
+#else
+        y_ptr[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+        y_ptr[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r1 + alpha_i * temp_i1;
+        y_ptr[1] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+        y_ptr += inc_y;
+        y_ptr[0] += alpha_r * temp_r2 + alpha_i * temp_i2;
+        y_ptr[1] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+#endif
+        return (0);
+    }
+
+    return (0);
+}
+
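[Editorial note: both the zgemv_n kernel above and the zgemv_t kernel in the next file target the POWER10 MMA unit. __builtin_mma_xvf64ger seeds a 4x2 double-precision accumulator with the outer product of a __vector_pair (four doubles from a matrix column) and a vec_t (two doubles from x), and xvf64gerpp keeps stacking rank-1 updates until the accumulator is spilled with SAVE_RESULT or __builtin_mma_disassemble_acc. A minimal standalone sketch of that pattern, assuming a recent GCC built with -mcpu=power10 (illustrative only, not part of the patch):]

    #include <altivec.h>
    typedef __vector unsigned char vec_t;

    /* acc = a(4x1) * x(1x2), a 4x2 double outer product, then read it back.
     * The exact lane layout of the disassembled accumulator follows the
     * POWER10 MMA convention (four 2-double VSX rows). */
    void outer_4x2(const double *a, const double *x, double out[4][2]) {
        __vector_quad acc;
        __vector_pair *va = (__vector_pair *)(void *)a;   /* a[0..3] */
        vec_t *vx = (vec_t *)(void *)x;                   /* x[0..1] */
        __builtin_mma_xxsetaccz(&acc);                    /* zero the accumulator */
        __builtin_mma_xvf64gerpp(&acc, va[0], vx[0]);     /* acc += a * x^T */
        __builtin_mma_disassemble_acc((void *)out, &acc);
    }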
diff --git a/kernel/power/zgemv_t_4.c b/kernel/power/zgemv_t_4.c
index 956d75ffc..d3bf60ca7 100644
--- a/kernel/power/zgemv_t_4.c
+++ b/kernel/power/zgemv_t_4.c
@@ -43,6 +43,134 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #elif HAVE_KERNEL_4x4_VEC
+#if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
+typedef __vector unsigned char vec_t;
+typedef FLOAT v4sf_t __attribute__ ((vector_size (16)));
+
+static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
+    BLASLONG i;
+    FLOAT *a0, *a1, *a2, *a3;
+    a0 = ap;
+    a1 = ap + lda;
+    a2 = a1 + lda;
+    a3 = a2 + lda;
+    __vector_quad acc0, acc1, acc2, acc3;
+    __vector_quad acc4, acc5, acc6, acc7;
+    v4sf_t result[4];
+    __vector_pair *Va0, *Va1, *Va2, *Va3;
+    i = 0;
+    n = n << 1;
+    __builtin_mma_xxsetaccz (&acc0);
+    __builtin_mma_xxsetaccz (&acc1);
+    __builtin_mma_xxsetaccz (&acc2);
+    __builtin_mma_xxsetaccz (&acc3);
+    __builtin_mma_xxsetaccz (&acc4);
+    __builtin_mma_xxsetaccz (&acc5);
+    __builtin_mma_xxsetaccz (&acc6);
+    __builtin_mma_xxsetaccz (&acc7);
+    while (i < n) {
+
+        vec_t *rx = (vec_t *) & x[i];
+        Va0 = ((__vector_pair*)((void*)&a0[i]));
+        Va1 = ((__vector_pair*)((void*)&a1[i]));
+        Va2 = ((__vector_pair*)((void*)&a2[i]));
+        Va3 = ((__vector_pair*)((void*)&a3[i]));
+
+        __builtin_mma_xvf64gerpp (&acc0, Va0[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[0], rx[0]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[0], rx[1]);
+        __builtin_mma_xvf64gerpp (&acc0, Va0[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc1, Va1[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc2, Va2[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc3, Va3[1], rx[2]);
+        __builtin_mma_xvf64gerpp (&acc4, Va0[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc5, Va1[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc6, Va2[1], rx[3]);
+        __builtin_mma_xvf64gerpp (&acc7, Va3[1], rx[3]);
+        i += 8;
+
+    }
+#if ( !defined(CONJ) && !defined(XCONJ) ) || ( defined(CONJ) && defined(XCONJ) )
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] - result[1][1];
+    register FLOAT temp_i0 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] - result[3][1];
+    temp_i0 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] - result[1][1];
+    register FLOAT temp_i1 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] - result[3][1];
+    temp_i1 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] - result[1][1];
+    register FLOAT temp_i2 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] - result[3][1];
+    temp_i2 += result[2][1] + result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] - result[1][1];
+    register FLOAT temp_i3 = result[0][1] + result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] - result[3][1];
+    temp_i3 += result[2][1] + result[3][0];
+#else
+    __builtin_mma_disassemble_acc ((void *)result, &acc0);
+    register FLOAT temp_r0 = result[0][0] + result[1][1];
+    register FLOAT temp_i0 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc4);
+    temp_r0 += result[2][0] + result[3][1];
+    temp_i0 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc1);
+    register FLOAT temp_r1 = result[0][0] + result[1][1];
+    register FLOAT temp_i1 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc5);
+    temp_r1 += result[2][0] + result[3][1];
+    temp_i1 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc2);
+    register FLOAT temp_r2 = result[0][0] + result[1][1];
+    register FLOAT temp_i2 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc6);
+    temp_r2 += result[2][0] + result[3][1];
+    temp_i2 += result[2][1] - result[3][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc3);
+    register FLOAT temp_r3 = result[0][0] + result[1][1];
+    register FLOAT temp_i3 = result[0][1] - result[1][0];
+    __builtin_mma_disassemble_acc ((void *)result, &acc7);
+    temp_r3 += result[2][0] + result[3][1];
+    temp_i3 += result[2][1] - result[3][0];
+#endif
+#if !defined(XCONJ)
+
+    y[0] += alpha_r * temp_r0 - alpha_i * temp_i0;
+    y[1] += alpha_r * temp_i0 + alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 - alpha_i * temp_i1;
+    y[3] += alpha_r * temp_i1 + alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 - alpha_i * temp_i2;
+    y[5] += alpha_r * temp_i2 + alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 - alpha_i * temp_i3;
+    y[7] += alpha_r * temp_i3 + alpha_i * temp_r3;
+
+#else
+
+    y[0] += alpha_r * temp_r0 + alpha_i * temp_i0;
+    y[1] -= alpha_r * temp_i0 - alpha_i * temp_r0;
+    y[2] += alpha_r * temp_r1 + alpha_i * temp_i1;
+    y[3] -= alpha_r * temp_i1 - alpha_i * temp_r1;
+    y[4] += alpha_r * temp_r2 + alpha_i * temp_i2;
+    y[5] -= alpha_r * temp_i2 - alpha_i * temp_r2;
+    y[6] += alpha_r * temp_r3 + alpha_i * temp_i3;
+    y[7] -= alpha_r * temp_i3 - alpha_i * temp_r3;
+#endif
+}
+#else
 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
     BLASLONG i;
     FLOAT *a0, *a1, *a2, *a3;
@@ -198,6 +326,7 @@ static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOA
 #endif
 }
+#endif
 #else
 static void zgemv_kernel_4x4(BLASLONG n, BLASLONG lda, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT alpha_r, FLOAT alpha_i) {
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 0068138e8..59ddc149f 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -43,12 +43,16 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(DOUBLE)
 #include "zscal_microk_power8.c"
 #endif
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #if defined(DOUBLE)
 #include "zscal_microk_power10.c"
 #else
 #include "cscal_microk_power10.c"
 #endif
+#elif defined(POWER10)
+#if defined(DOUBLE)
+#include "zscal_microk_power8.c"
+#endif
 #endif
 #endif
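[Editorial note: the zscal change above and the zswap change below apply the same guard: the POWER10 microkernels assume little-endian lane order, so big-endian POWER10 builds now fall back to the proven POWER8 microkernels instead of selecting code with the wrong element layout. The dispatch shape, schematically (consolidated from the diff, not new code):]

    #if defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
    #include "zscal_microk_power10.c"   /* little-endian POWER10: use the MMA/VSX path */
    #elif defined(POWER10)
    #include "zscal_microk_power8.c"    /* big-endian POWER10: reuse the POWER8 kernel */
    #endif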
diff --git a/kernel/power/zswap.c b/kernel/power/zswap.c
index 6cd3d9664..908802b71 100644
--- a/kernel/power/zswap.c
+++ b/kernel/power/zswap.c
@@ -39,8 +39,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 #if defined(__VEC__) || defined(__ALTIVEC__)
 #if defined(POWER8) || defined(POWER9)
 #include "zswap_microk_power8.c"
-#elif defined(POWER10)
+#elif defined(POWER10) && (__BYTE_ORDER__ != __ORDER_BIG_ENDIAN__)
 #include "cswap_microk_power10.c"
+#elif defined(POWER10)
+#include "zswap_microk_power8.c"
 #endif
 #endif
diff --git a/kernel/x86_64/KERNEL b/kernel/x86_64/KERNEL
index 5da79cc3f..bea7036c2 100644
--- a/kernel/x86_64/KERNEL
+++ b/kernel/x86_64/KERNEL
@@ -491,4 +491,3 @@ SSUMKERNEL = ../arm/sum.c
 DSUMKERNEL = ../arm/sum.c
 
 SOMATCOPY_RT = omatcopy_rt.c
-DOMATCOPY_RT = omatcopy_rt.c
diff --git a/kernel/x86_64/drot_microk_haswell-2.c b/kernel/x86_64/drot_microk_haswell-2.c
index 72a87696e..cc5949b1a 100644
--- a/kernel/x86_64/drot_microk_haswell-2.c
+++ b/kernel/x86_64/drot_microk_haswell-2.c
@@ -1,6 +1,4 @@
-/* need a new enough GCC for avx512 support */
-#if (( defined(__GNUC__) && __GNUC__ > 6 && defined(__AVX512CD__)) || (defined(__clang__) && __clang_major__ >= 9))
-
+#if defined(HAVE_FMA3) && defined(HAVE_AVX2)
 #define HAVE_DROT_KERNEL 1
 
 #include <immintrin.h>
diff --git a/kernel/x86_64/sgemm_direct_skylakex.c b/kernel/x86_64/sgemm_direct_skylakex.c
index cc2ac5553..2588289d1 100644
--- a/kernel/x86_64/sgemm_direct_skylakex.c
+++ b/kernel/x86_64/sgemm_direct_skylakex.c
@@ -1,10 +1,10 @@
 
 /* the direct sgemm code written by Arjan van der Ven */
-
+#include "common.h"
 #if defined(SKYLAKEX) || defined (COOPERLAKE)
 #include <immintrin.h>
-#include "common.h"
+
 /*
  * "Direct sgemm" code. This code operates directly on the inputs and outputs
@@ -472,7 +472,7 @@ void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG s
 }
 }
 #else
-#include "common.h"
+
 void CNAME (BLASLONG M, BLASLONG N, BLASLONG K, float * __restrict A, BLASLONG strideA, float * __restrict B, BLASLONG strideB , float * __restrict R, BLASLONG strideR) {}
 #endif
diff --git a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
index f3d614242..2db8b2fea 100644
--- a/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
+++ b/kernel/x86_64/sgemm_kernel_16x4_skylakex_3.c
@@ -501,7 +501,11 @@ CNAME(BLASLONG m, BLASLONG n, BLASLONG k, float alpha, float * __restrict__ A, f
     int32_t permil[16] = {0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3};
     BLASLONG n_count = n;
     float *a_pointer = A,*b_pointer = B,*c_pointer = C,*ctemp = C,*next_b = B;
+#if defined(__clang__)
+    for(;n_count>23;n_count-=24) COMPUTE(24)
+#else
     for(;n_count>23;n_count-=24) COMPUTE_n24
+#endif
     for(;n_count>19;n_count-=20) COMPUTE(20)
     for(;n_count>15;n_count-=16) COMPUTE(16)
     for(;n_count>11;n_count-=12) COMPUTE(12)
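[Editorial note: the drot change above replaces compiler-version sniffing with the build system's own feature macros: whether the Haswell kernel is usable depends on the target's FMA3/AVX2 support, not on which compiler built it (the old test even keyed on __AVX512CD__, which the changelog calls out as the wrong requirement for a Haswell kernel). When a runtime rather than compile-time decision is wanted, GCC and clang expose the same information through __builtin_cpu_supports; a hypothetical sketch, not part of the patch:]

    /* runtime feature check via the GCC/clang builtin; illustrative only */
    static int can_use_haswell_drot(void) {
        return __builtin_cpu_supports("avx2") && __builtin_cpu_supports("fma");
    }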
diff --git a/lapack-netlib/LAPACKE/include/lapack.h b/lapack-netlib/LAPACKE/include/lapack.h
index aedaa308d..ada1944b2 100644
--- a/lapack-netlib/LAPACKE/include/lapack.h
+++ b/lapack-netlib/LAPACKE/include/lapack.h
@@ -566,8 +566,8 @@ void LAPACK_cgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     lapack_complex_float const* AB, lapack_int const* ldab,
     lapack_complex_float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -585,8 +585,8 @@ void LAPACK_dgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     double const* AB, lapack_int const* ldab,
     double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -604,8 +604,8 @@ void LAPACK_sgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     float const* AB, lapack_int const* ldab,
     float const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    float* R,
-    float* C,
+    const float* R,
+    const float* C,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -623,8 +623,8 @@ void LAPACK_zgbrfsx(
     lapack_int const* n, lapack_int const* kl, lapack_int const* ku, lapack_int const* nrhs,
     lapack_complex_double const* AB, lapack_int const* ldab,
     lapack_complex_double const* AFB, lapack_int const* ldafb, lapack_int const* ipiv,
-    double* R,
-    double* C,
+    const double* R,
+    const double* C,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -2941,6 +2941,42 @@ void LAPACK_zgetsls(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cgetsqrhrt LAPACK_GLOBAL(cgetsqrhrt,CGETSQRHRT)
+void LAPACK_cgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_dgetsqrhrt LAPACK_GLOBAL(dgetsqrhrt,DGETSQRHRT)
+void LAPACK_dgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    double* A, lapack_int const* lda,
+    double* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sgetsqrhrt LAPACK_GLOBAL(sgetsqrhrt,SGETSQRHRT)
+void LAPACK_sgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    float* A, lapack_int const* lda,
+    float* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zgetsqrhrt LAPACK_GLOBAL(zgetsqrhrt,ZGETSQRHRT)
+void LAPACK_zgetsqrhrt(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb1, lapack_int const* nb1, lapack_int const* nb2,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cggbak LAPACK_GLOBAL(cggbak,CGGBAK)
 void LAPACK_cggbak(
     char const* job, char const* side,
@@ -4768,7 +4804,7 @@ void LAPACK_chegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_float* A, lapack_int const* lda,
-    lapack_complex_float* B, lapack_int const* ldb,
+    const lapack_complex_float* B, lapack_int const* ldb,
     lapack_int* info );
 
 #define LAPACK_zhegst LAPACK_GLOBAL(zhegst,ZHEGST)
@@ -4776,7 +4812,7 @@ void LAPACK_zhegst(
     lapack_int const* itype, char const* uplo,
     lapack_int const* n,
     lapack_complex_double* A, lapack_int const* lda,
-    lapack_complex_double* B, lapack_int const* ldb,
+    const lapack_complex_double* B, lapack_int const* ldb,
     lapack_int* info );
 
 #define LAPACK_chegv LAPACK_GLOBAL(chegv,CHEGV)
@@ -4913,7 +4949,7 @@ void LAPACK_cherfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -4931,7 +4967,7 @@ void LAPACK_zherfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -7251,6 +7287,24 @@ void LAPACK_sorgtr(
     float* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_dorgtsqr_row LAPACK_GLOBAL(dorgtsqr_row,DORGTSQR_ROW)
+void LAPACK_dorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    double* A, lapack_int const* lda,
+    double const* T, lapack_int const* ldt,
+    double* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_sorgtsqr_row LAPACK_GLOBAL(sorgtsqr_row,SORGTSQR_ROW)
+void LAPACK_sorgtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    float* A, lapack_int const* lda,
+    float const* T, lapack_int const* ldt,
+    float* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_dormbr LAPACK_GLOBAL(dormbr,DORMBR)
 void LAPACK_dormbr(
     char const* vect, char const* side, char const* trans,
@@ -8005,7 +8059,7 @@ void LAPACK_cporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -8023,7 +8077,7 @@ void LAPACK_dporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     double const* A, lapack_int const* lda,
     double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -8041,7 +8095,7 @@ void LAPACK_sporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     float const* A, lapack_int const* lda,
     float const* AF, lapack_int const* ldaf,
-    float* S,
+    const float* S,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -8059,7 +8113,7 @@ void LAPACK_zporfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -10756,7 +10810,7 @@ void LAPACK_csyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_float const* A, lapack_int const* lda,
     lapack_complex_float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     lapack_complex_float const* B, lapack_int const* ldb,
     lapack_complex_float* X, lapack_int const* ldx,
     float* rcond,
@@ -10774,7 +10828,7 @@ void LAPACK_dsyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     double const* A, lapack_int const* lda,
     double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     double const* B, lapack_int const* ldb,
     double* X, lapack_int const* ldx,
     double* rcond,
@@ -10792,7 +10846,7 @@ void LAPACK_ssyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     float const* A, lapack_int const* lda,
     float const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    float* S,
+    const float* S,
     float const* B, lapack_int const* ldb,
     float* X, lapack_int const* ldx,
     float* rcond,
@@ -10810,7 +10864,7 @@ void LAPACK_zsyrfsx(
     lapack_int const* n, lapack_int const* nrhs,
     lapack_complex_double const* A, lapack_int const* lda,
     lapack_complex_double const* AF, lapack_int const* ldaf, lapack_int const* ipiv,
-    double* S,
+    const double* S,
     lapack_complex_double const* B, lapack_int const* ldb,
     lapack_complex_double* X, lapack_int const* ldx,
     double* rcond,
@@ -11556,7 +11610,7 @@ void LAPACK_zsytrs(
 void LAPACK_csytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_float* A, lapack_int const* lda, lapack_int const* ipiv,
     lapack_complex_float* B, lapack_int const* ldb,
     lapack_complex_float* work,
     lapack_int* info );
@@ -11565,7 +11619,7 @@ void LAPACK_csytrs2(
 void LAPACK_dsytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const double* A, lapack_int const* lda, lapack_int const* ipiv,
     double* B, lapack_int const* ldb,
     double* work,
     lapack_int* info );
@@ -11574,7 +11628,7 @@ void LAPACK_ssytrs2(
 void LAPACK_ssytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    float* A, lapack_int const* lda, lapack_int const* ipiv,
+    const float* A, lapack_int const* lda, lapack_int const* ipiv,
     float* B, lapack_int const* ldb,
     float* work,
     lapack_int* info );
@@ -11583,7 +11637,7 @@ void LAPACK_zsytrs2(
 void LAPACK_zsytrs2(
     char const* uplo,
     lapack_int const* n, lapack_int const* nrhs,
-    lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
+    const lapack_complex_double* A, lapack_int const* lda, lapack_int const* ipiv,
     lapack_complex_double* B, lapack_int const* ldb,
     lapack_complex_double* work,
     lapack_int* info );
@@ -13540,6 +13594,24 @@ void LAPACK_zungtr(
     lapack_complex_double* work, lapack_int const* lwork,
     lapack_int* info );
 
+#define LAPACK_cungtsqr_row LAPACK_GLOBAL(cungtsqr_row,CUNGTSQR_ROW)
+void LAPACK_cungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_float* A, lapack_int const* lda,
+    lapack_complex_float const* T, lapack_int const* ldt,
+    lapack_complex_float* work, lapack_int const* lwork,
+    lapack_int* info );
+
+#define LAPACK_zungtsqr_row LAPACK_GLOBAL(zungtsqr_row,ZUNGTSQR_ROW)
+void LAPACK_zungtsqr_row(
+    lapack_int const* m, lapack_int const* n,
+    lapack_int const* mb, lapack_int const* nb,
+    lapack_complex_double* A, lapack_int const* lda,
+    lapack_complex_double const* T, lapack_int const* ldt,
+    lapack_complex_double* work, lapack_int const* lwork,
+    lapack_int* info );
+
 #define LAPACK_cunmbr LAPACK_GLOBAL(cunmbr,CUNMBR)
 void LAPACK_cunmbr(
     char const* vect, char const* side, char const* trans,
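[Editorial note: all of the prototype edits in lapack.h above are the interface fixes imported from Reference-LAPACK PRs 534 and 537: arrays the Fortran routines only read (the R/C row and column scalings, the S scaling, and the factored matrices in ?sytrs2/?hegst) gain a const qualifier. The change is source-compatible for existing callers, since C implicitly converts T* to const T*:]

    /* existing callers keep compiling: double* converts to const double* */
    extern void takes_const(const double *S);
    void caller(double *S) { takes_const(S); }  /* fine, no cast needed */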
diff --git a/lapack-netlib/LAPACKE/include/lapacke.h b/lapack-netlib/LAPACKE/include/lapacke.h
index 012c104bb..5c129db91 100644
--- a/lapack-netlib/LAPACKE/include/lapacke.h
+++ b/lapack-netlib/LAPACKE/include/lapacke.h
@@ -1867,11 +1867,11 @@ lapack_int LAPACKE_zheevx( int matrix_layout, char jobz, char range, char uplo,
 lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_float* a,
-                           lapack_int lda, lapack_complex_float* b,
+                           lapack_int lda, const lapack_complex_float* b,
                            lapack_int ldb );
 lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo,
                            lapack_int n, lapack_complex_double* a,
-                           lapack_int lda, lapack_complex_double* b,
+                           lapack_int lda, const lapack_complex_double* b,
                            lapack_int ldb );
 lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz,
@@ -2598,6 +2598,15 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a,
 lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a,
                            lapack_int lda, const double* tau );
+lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 float* a, lapack_int lda,
+                                 const float* t, lapack_int ldt );
+lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 double* a, lapack_int lda,
+                                 const double* t, lapack_int ldt );
+
 lapack_int LAPACKE_sormbr( int matrix_layout, char vect, char side, char trans,
                            lapack_int m, lapack_int n, lapack_int k,
                            const float* a, lapack_int lda, const float* tau,
@@ -4577,6 +4586,15 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n,
                            lapack_complex_double* a, lapack_int lda,
                            const lapack_complex_double* tau );
+lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_float* a, lapack_int lda,
+                                 const lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n,
+                                 lapack_int mb, lapack_int nb,
+                                 lapack_complex_double* a, lapack_int lda,
+                                 const lapack_complex_double* t, lapack_int ldt );
+
 lapack_int LAPACKE_cunmbr( int matrix_layout, char vect, char side, char trans,
                            lapack_int m, lapack_int n, lapack_int k,
                            const lapack_complex_float* a, lapack_int lda,
@@ -6932,11 +6950,11 @@ lapack_int LAPACKE_zheevx_work( int matrix_layout, char jobz, char range,
 lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_float* a,
-                                lapack_int lda, lapack_complex_float* b,
+                                lapack_int lda, const lapack_complex_float* b,
                                 lapack_int ldb );
 lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo,
                                 lapack_int n, lapack_complex_double* a,
-                                lapack_int lda, lapack_complex_double* b,
+                                lapack_int lda, const lapack_complex_double* b,
                                 lapack_int ldb );
 lapack_int LAPACKE_chegv_work( int matrix_layout, lapack_int itype, char jobz,
@@ -7880,6 +7898,19 @@ lapack_int LAPACKE_dorgtr_work( int matrix_layout, char uplo, lapack_int n,
                                 double* a, lapack_int lda, const double* tau,
                                 double* work, lapack_int lwork );
+lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      float* a, lapack_int lda,
+                                      const float* t, lapack_int ldt,
+                                      float* work, lapack_int lwork );
+lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      double* a, lapack_int lda,
+                                      const double* t, lapack_int ldt,
+                                      double* work, lapack_int lwork );
+
 lapack_int LAPACKE_sormbr_work( int matrix_layout, char vect, char side,
                                 char trans, lapack_int m, lapack_int n,
                                 lapack_int k, const float* a, lapack_int lda,
@@ -10281,6 +10312,19 @@ lapack_int LAPACKE_zungtr_work( int matrix_layout, char uplo, lapack_int n,
                                 const lapack_complex_double* tau,
                                 lapack_complex_double* work, lapack_int lwork );
+lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_float* a, lapack_int lda,
+                                      const lapack_complex_float* t, lapack_int ldt,
+                                      lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout,
+                                      lapack_int m, lapack_int n,
+                                      lapack_int mb, lapack_int nb,
+                                      lapack_complex_double* a, lapack_int lda,
+                                      const lapack_complex_double* t, lapack_int ldt,
+                                      lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_cunmbr_work( int matrix_layout, char vect, char side,
                                 char trans, lapack_int m, lapack_int n,
                                 lapack_int k, const lapack_complex_float* a,
@@ -10553,11 +10597,11 @@ lapack_int LAPACKE_csytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_float* work, lapack_int nb );
 lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_float* a,
+                            lapack_int nrhs, const lapack_complex_float* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_float* b, lapack_int ldb );
 lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_float* a,
+                                 lapack_int nrhs, const lapack_complex_float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_float* b, lapack_int ldb,
                                  lapack_complex_float* work );
@@ -10718,10 +10762,10 @@ lapack_int LAPACKE_dsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, double* work,
                                   lapack_int nb );
 lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, double* a, lapack_int lda,
+                            lapack_int nrhs, const double* a, lapack_int lda,
                             const lapack_int* ipiv,
                             double* b, lapack_int ldb );
 lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, double* a,
+                                 lapack_int nrhs, const double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  double* b, lapack_int ldb, double* work );
 lapack_int LAPACKE_sbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10813,10 +10857,10 @@ lapack_int LAPACKE_ssytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv, float* work,
                                   lapack_int nb );
 lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, float* a, lapack_int lda,
+                            lapack_int nrhs, const float* a, lapack_int lda,
                             const lapack_int* ipiv,
                             float* b, lapack_int ldb );
 lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, float* a,
+                                 lapack_int nrhs, const float* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  float* b, lapack_int ldb, float* work );
 lapack_int LAPACKE_zbbcsd( int matrix_layout, char jobu1, char jobu2,
@@ -10898,11 +10942,11 @@ lapack_int LAPACKE_zsytri2x_work( int matrix_layout, char uplo, lapack_int n,
                                   const lapack_int* ipiv,
                                   lapack_complex_double* work, lapack_int nb );
 lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n,
-                            lapack_int nrhs, lapack_complex_double* a,
+                            lapack_int nrhs, const lapack_complex_double* a,
                             lapack_int lda, const lapack_int* ipiv,
                             lapack_complex_double* b, lapack_int ldb );
 lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n,
-                                 lapack_int nrhs, lapack_complex_double* a,
+                                 lapack_int nrhs, const lapack_complex_double* a,
                                  lapack_int lda, const lapack_int* ipiv,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work );
@@ -12026,6 +12070,44 @@ lapack_int LAPACKE_zgetsls_work( int matrix_layout, char trans, lapack_int m,
                                  lapack_complex_double* b, lapack_int ldb,
                                  lapack_complex_double* work, lapack_int lwork );
 
+lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               float* a, lapack_int lda,
+                               float* t, lapack_int ldt );
+lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               double* a, lapack_int lda,
+                               double* t, lapack_int ldt );
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt );
+lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_double* a, lapack_int lda,
+                               lapack_complex_double* t, lapack_int ldt );
+
+lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    float* a, lapack_int lda,
+                                    float* t, lapack_int ldt,
+                                    float* work, lapack_int lwork );
+lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    double* a, lapack_int lda,
+                                    double* t, lapack_int ldt,
+                                    double* work, lapack_int lwork );
+lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_float* a, lapack_int lda,
+                                    lapack_complex_float* t, lapack_int ldt,
+                                    lapack_complex_float* work, lapack_int lwork );
+lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n,
+                                    lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                                    lapack_complex_double* a, lapack_int lda,
+                                    lapack_complex_double* t, lapack_int ldt,
+                                    lapack_complex_double* work, lapack_int lwork );
+
 lapack_int LAPACKE_ssyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
                                  float* a, lapack_int lda, float* w );
 lapack_int LAPACKE_dsyev_2stage( int matrix_layout, char jobz, char uplo, lapack_int n,
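[Editorial note: the new LAPACKE_?getsqrhrt wrappers follow the standard LAPACKE two-phase pattern, visible in the new source files further below: call the _work variant with lwork = -1 to query the optimal workspace size, allocate, then call again. The same pattern from the caller's side, sketched for the double variant; this is essentially what the high-level LAPACKE_dgetsqrhrt wrapper does internally, and it assumes a build that ships these new symbols:]

    #include <stdlib.h>
    #include "lapacke.h"

    lapack_int tsqr_with_hr(lapack_int m, lapack_int n, lapack_int mb1,
                            lapack_int nb1, lapack_int nb2,
                            double *a, lapack_int lda, double *t, lapack_int ldt) {
        double query;
        /* phase 1: workspace query (lwork = -1) */
        lapack_int info = LAPACKE_dgetsqrhrt_work(LAPACK_COL_MAJOR, m, n, mb1, nb1,
                                                  nb2, a, lda, t, ldt, &query, -1);
        if (info != 0) return info;
        lapack_int lwork = (lapack_int)query;
        double *work = malloc(sizeof(double) * lwork);
        if (work == NULL) return LAPACK_WORK_MEMORY_ERROR;
        /* phase 2: the actual factorization */
        info = LAPACKE_dgetsqrhrt_work(LAPACK_COL_MAJOR, m, n, mb1, nb1, nb2,
                                       a, lda, t, ldt, work, lwork);
        free(work);
        return info;
    }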
diff --git a/lapack-netlib/LAPACKE/src/Makefile b/lapack-netlib/LAPACKE/src/Makefile
index a602dd7a0..7f827e1c9 100644
--- a/lapack-netlib/LAPACKE/src/Makefile
+++ b/lapack-netlib/LAPACKE/src/Makefile
@@ -162,6 +162,8 @@ lapacke_cgetrs.o \
 lapacke_cgetrs_work.o \
 lapacke_cgetsls.o \
 lapacke_cgetsls_work.o \
+lapacke_cgetsqrhrt.o \
+lapacke_cgetsqrhrt_work.o \
 lapacke_cggbak.o \
 lapacke_cggbak_work.o \
 lapacke_cggbal.o \
@@ -634,6 +636,8 @@ lapacke_cungrq.o \
 lapacke_cungrq_work.o \
 lapacke_cungtr.o \
 lapacke_cungtr_work.o \
+lapacke_cungtsqr_row.o \
+lapacke_cungtsqr_row_work.o \
 lapacke_cunmbr.o \
 lapacke_cunmbr_work.o \
 lapacke_cunmhr.o \
@@ -778,6 +782,8 @@ lapacke_dgetrs.o \
 lapacke_dgetrs_work.o \
 lapacke_dgetsls.o \
 lapacke_dgetsls_work.o \
+lapacke_dgetsqrhrt.o \
+lapacke_dgetsqrhrt_work.o \
 lapacke_dggbak.o \
 lapacke_dggbak_work.o \
 lapacke_dggbal.o \
@@ -900,6 +906,8 @@ lapacke_dorgrq.o \
 lapacke_dorgrq_work.o \
 lapacke_dorgtr.o \
 lapacke_dorgtr_work.o \
+lapacke_dorgtsqr_row.o \
+lapacke_dorgtsqr_row_work.o \
 lapacke_dormbr.o \
 lapacke_dormbr_work.o \
 lapacke_dormhr.o \
@@ -1348,6 +1356,8 @@ lapacke_sgetrs.o \
 lapacke_sgetrs_work.o \
 lapacke_sgetsls.o \
 lapacke_sgetsls_work.o \
+lapacke_sgetsqrhrt.o \
+lapacke_sgetsqrhrt_work.o \
 lapacke_sggbak.o \
 lapacke_sggbak_work.o \
 lapacke_sggbal.o \
@@ -1468,6 +1478,8 @@ lapacke_sorgrq.o \
 lapacke_sorgrq_work.o \
 lapacke_sorgtr.o \
 lapacke_sorgtr_work.o \
+lapacke_sorgtsqr_row.o \
+lapacke_sorgtsqr_row_work.o \
 lapacke_sormbr.o \
 lapacke_sormbr_work.o \
 lapacke_sormhr.o \
@@ -1908,6 +1920,8 @@ lapacke_zgetrs.o \
 lapacke_zgetrs_work.o \
 lapacke_zgetsls.o \
 lapacke_zgetsls_work.o \
+lapacke_zgetsqrhrt.o \
+lapacke_zgetsqrhrt_work.o \
 lapacke_zggbak.o \
 lapacke_zggbak_work.o \
 lapacke_zggbal.o \
@@ -2380,6 +2394,8 @@ lapacke_zungrq.o \
 lapacke_zungrq_work.o \
 lapacke_zungtr.o \
 lapacke_zungtr_work.o \
+lapacke_zungtsqr_row.o \
+lapacke_zungtsqr_row_work.o \
 lapacke_zunmbr.o \
 lapacke_zunmbr_work.o \
 lapacke_zunmhr.o \
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
index 558a7f308..4256c0f04 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgesvd_work.c
@@ -56,6 +56,8 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
                                 ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
         lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                               ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+        lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                                LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
         lapack_int lda_t = MAX(1,m);
         lapack_int ldu_t = MAX(1,nrows_u);
         lapack_int ldvt_t = MAX(1,nrows_vt);
@@ -73,7 +75,7 @@ lapack_int LAPACKE_cgesvd_work( int matrix_layout, char jobu, char jobvt,
             LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
             return info;
         }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
             info = -12;
             LAPACKE_xerbla( "LAPACKE_cgesvd_work", info );
             return info;
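[Editorial note: the lapacke_cgesvd_work change relaxes an overly strict parameter check. VT has n columns only when jobvt is 'A' or 'S'; for 'N' or 'O' it is not referenced, so demanding ldvt >= n rejected valid calls. With the fix, a row-major call that computes singular values only is accepted (hedged example using the standard LAPACKE_cgesvd signature; previously this would fail with info = -12):]

    /* 3x2 row-major A, singular values only: U and VT are not referenced,
     * so ldu = ldvt = 1 is now considered valid */
    lapack_complex_float a[6] = {0};
    float s[2], superb[1];
    lapack_int info = LAPACKE_cgesvd(LAPACK_ROW_MAJOR, 'N', 'N', 3, 2,
                                     a, 2, s, NULL, 1, NULL, 1, superb);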
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
new file mode 100644
index 000000000..0e67e0b83
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt.c
@@ -0,0 +1,80 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+  SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+  INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+  CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+  ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
+  THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************
+* Contents: Native high-level C interface to LAPACK function cgetsqrhrt
+* Author: Intel Corporation
+*****************************************************************************/
+
+#include "lapacke_utils.h"
+
+lapack_int LAPACKE_cgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n,
+                               lapack_int mb1, lapack_int nb1, lapack_int nb2,
+                               lapack_complex_float* a, lapack_int lda,
+                               lapack_complex_float* t, lapack_int ldt )
+{
+    lapack_int info = 0;
+    lapack_int lwork = -1;
+    lapack_complex_float* work = NULL;
+    lapack_complex_float work_query;
+    if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", -1 );
+        return -1;
+    }
+#ifndef LAPACK_DISABLE_NAN_CHECK
+    if( LAPACKE_get_nancheck() ) {
+        /* Optionally check input matrices for NaNs */
+        if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) {
+            return -7;
+        }
+    }
+#endif
+    /* Query optimal working array(s) size */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, &work_query, lwork );
+    if( info != 0 ) {
+        goto exit_level_0;
+    }
+    lwork = LAPACK_C2INT( work_query );
+    /* Allocate memory for work arrays */
+    work = (lapack_complex_float*)
+        LAPACKE_malloc( sizeof(lapack_complex_float) * lwork );
+    if( work == NULL ) {
+        info = LAPACK_WORK_MEMORY_ERROR;
+        goto exit_level_0;
+    }
+    /* Call middle-level interface */
+    info = LAPACKE_cgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2,
+                                    a, lda, t, ldt, work, lwork );
+    /* Release memory and exit */
+    LAPACKE_free( work );
+exit_level_0:
+    if( info == LAPACK_WORK_MEMORY_ERROR ) {
+        LAPACKE_xerbla( "LAPACKE_cgetsqrhrt", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
new file mode 100644
index 000000000..598f193e6
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_cgetsqrhrt_work.c
@@ -0,0 +1,108 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+      notice, this list of conditions and the following disclaimer in the
+      documentation and/or other materials provided with the distribution.
+    * Neither the name of Intel Corporation nor the names of its contributors
+      may be used to endorse or promote products derived from this software
+      without specific prior written permission.
+
+  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+  AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+  IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_float* a, lapack_int lda, + lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_cgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_cge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_cgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c index aa78e678e..dbb2753d1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_cheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c index d26c84785..2f25c187a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c index e8f212efb..9e8a1c4db 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_cheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_cge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_che_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst.c b/lapack-netlib/LAPACKE/src/lapacke_chegst.c index ff7dd3532..c628017c2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { if( matrix_layout 
!= LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c index a29e01961..001863819 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_chegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_float* a, - lapack_int lda, lapack_complex_float* b, + lapack_int lda, const lapack_complex_float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv.c b/lapack-netlib/LAPACKE/src/lapacke_chegv.c index 15d052987..c01525662 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c index 537b9450b..fc3395833 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_chegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c index 98c901982..fe7b39cee 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_chegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c index 3ba62746e..d56e3ee46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_chegvx.c @@ -60,7 +60,7 @@ lapack_int LAPACKE_chegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_cge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c index 6937752c4..fc0d4e3d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_chetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_chetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c index 80d262626..eba359312 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_clacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_clacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c index 8c4c21935..4779f10d2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clantr_work.c @@ -41,45 +41,46 @@ float LAPACKE_clantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_clantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_float*) - LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ctr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_clantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_clantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_clantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_clascl.c b/lapack-netlib/LAPACKE/src/lapacke_clascl.c index fdcb02947..4f4e0bf35 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_clascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_clascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_clascl( int matrix_layout, char type, lapack_int kl, LAPACKE_cgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_chb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c index 7b25815e7..1b4fed17a 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_claset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_claset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_claset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_claset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c index 2eb942e4e..771395e97 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_csyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_csy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c index 44405c993..f4a0a4334 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c index 8567a07d5..d914c1d69 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_csytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_csytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_float* a, + lapack_int nrhs, const lapack_complex_float* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_float* b, lapack_int ldb, lapack_complex_float* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c index fd0a40c17..8ca652456 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ctrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c index c4ea703af..7b2e3a169 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ctrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ctrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ctr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c index ddae70345..faa3ef6d3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_cungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_cungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_cge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_che_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_c_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c new file mode 100644 index 000000000..bb551fcbc --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_float* work = NULL; + lapack_complex_float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_cge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_cge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_C2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_float*) + LAPACKE_malloc( sizeof(lapack_complex_float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_cungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_cungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c new file mode 100644 index 000000000..96b18ab13 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_cungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function cungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_cungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_float* a, lapack_int lda, + const lapack_complex_float* t, lapack_int ldt, + lapack_complex_float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_float*)
+              LAPACKE_malloc( sizeof(lapack_complex_float) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_cge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_cge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_cungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_cge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_cungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
index d9fb2dca0..71ad23f2f 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_cunmtr.c
@@ -52,7 +52,7 @@ lapack_int LAPACKE_cunmtr( int matrix_layout, char side, char uplo, char trans,
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
         r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_cge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_che_nancheck( matrix_layout, uplo, r, a, lda ) ) {
             return -7;
         }
         if( LAPACKE_cge_nancheck( matrix_layout, m, n, c, ldc ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
index 7dbc9bb88..671def1df 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgesvd_work.c
@@ -54,6 +54,8 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
                              ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1);
     lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n :
                          ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1);
+    lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) ||
+                            LAPACKE_lsame( jobvt, 's' ) ) ? n : 1;
     lapack_int lda_t = MAX(1,m);
     lapack_int ldu_t = MAX(1,nrows_u);
     lapack_int ldvt_t = MAX(1,nrows_vt);
@@ -71,7 +73,7 @@ lapack_int LAPACKE_dgesvd_work( int matrix_layout, char jobu, char jobvt,
             LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
             return info;
         }
-        if( ldvt < n ) {
+        if( ldvt < ncols_vt ) {
             info = -12;
             LAPACKE_xerbla( "LAPACKE_dgesvd_work", info );
             return info;
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
new file mode 100644
index 000000000..cf0e3200c
--- /dev/null
+++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt.c
@@ -0,0 +1,79 @@
+/*****************************************************************************
+  Copyright (c) 2020, Intel Corp.
+  All rights reserved.
+
+  Redistribution and use in source and binary forms, with or without
+  modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+      this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c new file mode 100644 index 000000000..f91887ffe --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. 
+ * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + double* a, lapack_int lda, + double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_dgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_dge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_dgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c index f1a505486..88f4489a3 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c index 5b2a6c535..9c9b0ea8b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlantr_work.c @@ -40,44 +40,46 @@ double LAPACKE_dlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - 
LAPACKE_dtr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_dlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_dlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_dlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c index 5b579a5d1..058105127 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_dlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_dgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_dsb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c index 4b59fe627..f1444b5e2 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dlaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_dlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_dlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c index 86184b784..587805de6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dorgtr( int matrix_layout, char uplo, lapack_int n, double* a #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_d_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c new file mode 100644 index 000000000..1da3405a8 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + double* work = NULL; + double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_dge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_dge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (double*)LAPACKE_malloc( sizeof(double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_dorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c new file mode 100644 index 000000000..e16467f3a --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_dorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function dorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_dorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + double* a, lapack_int lda, + const double* t, lapack_int ldt, + double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (double*)LAPACKE_malloc( sizeof(double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (double*)LAPACKE_malloc( sizeof(double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_dge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_dge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_dorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_dge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_dorgtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
index db75a6609..0b1c54b9b 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dormtr.c
@@ -51,7 +51,7 @@ lapack_int LAPACKE_dormtr( int matrix_layout, char side, char uplo, char trans,
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
         r = LAPACKE_lsame( side, 'l' ) ? m : n;
-        if( LAPACKE_dge_nancheck( matrix_layout, r, r, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, r, a, lda ) ) {
             return -7;
         }
         if( LAPACKE_dge_nancheck( matrix_layout, m, n, c, ldc ) ) {
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
index cca9be489..36ff7c40c 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyconv.c
@@ -43,7 +43,7 @@ lapack_int LAPACKE_dsyconv( int matrix_layout, char uplo, char way, lapack_int n
 #ifndef LAPACK_DISABLE_NAN_CHECK
     if( LAPACKE_get_nancheck() ) {
         /* Optionally check input matrices for NaNs */
-        if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) {
+        if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) {
             return -5;
         }
     }
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
index f696c608f..78f9e80ed 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyev_work.c
@@ -72,7 +72,7 @@ lapack_int LAPACKE_dsyev_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
             LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda );
         } else {
             LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda );
diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
index 6f9c02f6a..d68989aa6 100644
--- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
+++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_2stage_work.c
@@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_2stage_work( int matrix_layout, char jobz, char uplo,
             info = info - 1;
         }
         /* Transpose output matrices */
-        if ( jobz == 'V') {
+        if ( jobz == 'V' || jobz == 'v' ) {
             LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda
); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c index 81ba2acb3..25d075d46 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_dsyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_dge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_dsy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c index 800a30b24..69b90e758 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_dsygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c index 533b6a446..4ece69794 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c index 974b63e54..0016a7d06 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_dsygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c index 51f333359..0db0cfa67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_dsygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c index 02d54d7fa..54fa6ff36 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_dsygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_dge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_dsy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c index 4d73ef3c1..46c90190f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, lapack_int lda, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c index caffa5b4b..c937c39c5 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dsytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_dsytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, double* a, + lapack_int nrhs, const double* a, lapack_int lda, const lapack_int* ipiv, double* b, lapack_int ldb, double* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c index 66d1e5a2c..de379a970 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_dtrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c index 89f01dc95..d17593471 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_dtrttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_dtrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_dge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_dtr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c index 9dc5509c9..941d83cad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sgesvd_work.c @@ -54,6 +54,8 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? 
n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -71,7 +73,7 @@ lapack_int LAPACKE_sgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_sgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c new file mode 100644 index 000000000..759afce48 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt.c @@ -0,0 +1,79 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
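+
+  NOTE: the wrapper below follows the standard LAPACKE two-phase convention:
+  a first call with lwork = -1 queries the Fortran routine for its optimal
+  workspace size, then the wrapper allocates that workspace and makes the
+  real call. A minimal caller sketch (not part of the Reference-LAPACK
+  sources), assuming a column-major m-by-n array a and an nb2-by-n output
+  array t already allocated by the user:
+
+      #include "lapacke.h"
+      lapack_int info = LAPACKE_sgetsqrhrt( LAPACK_COL_MAJOR, m, n,
+                                            mb1, nb1, nb2, a, m, t, nb2 );
+
+  Workspace allocation is handled entirely inside the wrapper.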
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c new file mode 100644 index 000000000..40193008d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sgetsqrhrt_work.c @@ -0,0 +1,106 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
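+
+  NOTE: in the row-major branch, the wrapper below validates lda and ldt,
+  copies a into a transposed scratch buffer, calls the column-major Fortran
+  kernel, and transposes the results back; t needs no input transpose since
+  SGETSQRHRT writes it as pure output. A simplified sketch of that flow
+  (not part of the Reference-LAPACK sources):
+
+      LAPACKE_sge_trans( LAPACK_ROW_MAJOR, m, n, a, lda, a_t, lda_t );
+      LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t,
+                         t_t, &ldt_t, work, &lwork, &info );
+      LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+      LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt );
+
+  Negative info values from the Fortran side are decremented by one so that
+  they keep pointing at the right argument once matrix_layout counts as
+  argument 1 of the C interface.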
+***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + float* a, lapack_int lda, + float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_sge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c index e60167001..cdec2c967 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slacpy_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c index e1d4c270d..f77abef2c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slantr_work.c @@ -40,44 +40,46 @@ float LAPACKE_slantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; float res = 0.; if( 
matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - float* a_t = NULL; float* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_slantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (float*)LAPACKE_malloc( sizeof(float) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_str_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_slantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_slantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_slantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slascl.c b/lapack-netlib/LAPACKE/src/lapacke_slascl.c index 25bd9624e..62f7390ed 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_slascl( int matrix_layout, char type, lapack_int kl, LAPACKE_sgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_ssb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c index c89c9a6e1..4f2fa7b67 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_slaset_work.c @@ -41,9 +41,6 @@ lapack_int LAPACKE_slaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_slaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); float* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c index 90dc435c9..804b7f8ef 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtr.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_sorgtr( int matrix_layout, char uplo, lapack_int n, float* a, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally 
check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_s_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c new file mode 100644 index 000000000..350783a78 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row.c @@ -0,0 +1,82 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
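+
+  NOTE: SORGTSQR_ROW generates the explicit m-by-n orthonormal factor Q from
+  the blocked Householder representation produced by SLATSQR: the reflectors
+  stay in a and the nb-by-n array t holds the triangular T blocks. A hedged
+  usage sketch (not part of the Reference-LAPACK sources), with column-major
+  data and lda = m, ldt = nb:
+
+      #include "lapacke.h"
+      /* after a factorization along the lines of
+         LAPACKE_slatsqr( LAPACK_COL_MAJOR, m, n, mb, nb, a, m, t, nb ); */
+      lapack_int info = LAPACKE_sorgtsqr_row( LAPACK_COL_MAJOR, m, n,
+                                              mb, nb, a, m, t, nb );
+
+  The NaN checks below assume the same shapes: a is m-by-n, t is nb-by-n.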
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + float* work = NULL; + float work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_sge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_sge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = (lapack_int)work_query; + /* Allocate memory for work arrays */ + work = (float*)LAPACKE_malloc( sizeof(float) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_sorgtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c new file mode 100644 index 000000000..a66f70b52 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_sorgtsqr_row_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function sorgtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_sorgtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + float* a, lapack_int lda, + const float* t, lapack_int ldt, + float* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + float* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + float* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? (info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (float*)LAPACKE_malloc( sizeof(float) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (float*)LAPACKE_malloc( sizeof(float) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices; note t (not a) is copied into t_t */ + LAPACKE_sge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + LAPACKE_sge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t ); + /* Call LAPACK function and adjust info */ + LAPACK_sorgtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_sge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_sorgtsqr_row_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c index 9f0e9fddf..6ffe144cc 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_sormtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_sormtr.c @@ -51,7 +51,7 @@ lapack_int LAPACKE_sormtr( int matrix_layout, char side, char uplo, char trans, if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ r = LAPACKE_lsame( side, 'l' ) ?
m : n; - if( LAPACKE_sge_nancheck( matrix_layout, r, r, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, r, a, lda ) ) { return -7; } if( LAPACKE_sge_nancheck( matrix_layout, m, n, c, ldc ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c index 5fd0a78c5..ac41a354d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyconv.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_ssyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c index abd62ddf3..1889a337c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyev_work.c @@ -72,7 +72,7 @@ lapack_int LAPACKE_ssyev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c index d9fe47599..faadc92f1 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_2stage_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c index bfbf49aee..434b52c01 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssyevd_work.c @@ -76,7 +76,7 @@ lapack_int LAPACKE_ssyevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_sge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_ssy_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c index 7b97f472b..4fb55960c 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygst.c @@ -47,7 +47,7 @@ lapack_int LAPACKE_ssygst( int matrix_layout, lapack_int itype, char uplo, if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -7; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c index 8ec40d954..f139de1ab 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* 
Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c index a2eba6653..195fb1e54 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygv_2stage.c @@ -48,10 +48,10 @@ lapack_int LAPACKE_ssygv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c index 5afe8d2de..e33ce2a7b 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvd.c @@ -51,10 +51,10 @@ lapack_int LAPACKE_ssygvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c index 1fe4e2c6c..8ffd9dc40 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssygvx.c @@ -58,7 +58,7 @@ lapack_int LAPACKE_ssygvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_s_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_sge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_ssy_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c index 19f447cd8..a95a71469 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, lapack_int lda, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c index 7d348b382..cf98f443d 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ssytrs2_work.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_ssytrs2_work( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, float* a, + lapack_int nrhs, const float* a, lapack_int lda, const lapack_int* ipiv, float* b, lapack_int ldb, float* work ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttf.c b/lapack-netlib/LAPACKE/src/lapacke_strttf.c index fee7ab9ae..e3304fbe7 100644 --- 
a/lapack-netlib/LAPACKE/src/lapacke_strttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_strttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_strttp.c b/lapack-netlib/LAPACKE/src/lapacke_strttp.c index 6c4b84aa3..2df79eb05 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_strttp.c +++ b/lapack-netlib/LAPACKE/src/lapacke_strttp.c @@ -43,7 +43,7 @@ lapack_int LAPACKE_strttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_sge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_str_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c index 2d7c2b6f3..da73cd479 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zgesvd_work.c @@ -56,6 +56,8 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, ( LAPACKE_lsame( jobu, 's' ) ? MIN(m,n) : 1); lapack_int nrows_vt = LAPACKE_lsame( jobvt, 'a' ) ? n : ( LAPACKE_lsame( jobvt, 's' ) ? MIN(m,n) : 1); + lapack_int ncols_vt = ( LAPACKE_lsame( jobvt, 'a' ) || + LAPACKE_lsame( jobvt, 's' ) ) ? n : 1; lapack_int lda_t = MAX(1,m); lapack_int ldu_t = MAX(1,nrows_u); lapack_int ldvt_t = MAX(1,nrows_vt); @@ -73,7 +75,7 @@ lapack_int LAPACKE_zgesvd_work( int matrix_layout, char jobu, char jobvt, LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; } - if( ldvt < n ) { + if( ldvt < ncols_vt ) { info = -12; LAPACKE_xerbla( "LAPACKE_zgesvd_work", info ); return info; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c new file mode 100644 index 000000000..53557c92d --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt.c @@ -0,0 +1,80 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -7; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zgetsqrhrt_work( matrix_layout, m, n, mb1, nb1, nb2, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c new file mode 100644 index 000000000..a6825df56 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zgetsqrhrt_work.c @@ -0,0 +1,108 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zgetsqrhrt +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zgetsqrhrt_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb1, lapack_int nb1, lapack_int nb2, + lapack_complex_double* a, lapack_int lda, + lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if( matrix_layout == LAPACK_COL_MAJOR ) { + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + } else if( matrix_layout == LAPACK_ROW_MAJOR ) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + lapack_int ldt_t = MAX(1,nb2); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -8; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + if( ldt < n ) { + info = -10; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info; + } + /* Allocate memory for temporary array(s) */ + a_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) ); + if( a_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_0; + } + t_t = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) ); + if( t_t == NULL ) { + info = LAPACK_TRANSPOSE_MEMORY_ERROR; + goto exit_level_1; + } + /* Transpose input matrices */ + LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t ); + /* Call LAPACK function and adjust info */ + LAPACK_zgetsqrhrt( &m, &n, &mb1, &nb1, &nb2, a_t, &lda_t, t_t, &ldt_t, + work, &lwork, &info ); + if( info < 0 ) { + info = info - 1; + } + /* Transpose output matrices */ + LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda ); + LAPACKE_zge_trans( LAPACK_COL_MAJOR, nb2, n, t_t, ldt_t, t, ldt ); + /* Release memory and exit */ + LAPACKE_free( t_t ); +exit_level_1: + LAPACKE_free( a_t ); +exit_level_0: + if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + } else { + info = -1; + LAPACKE_xerbla( "LAPACKE_zgetsqrhrt_work", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c index d4e93aed2..8b7aa3518 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheev_work.c @@ -78,7 +78,7 @@ lapack_int LAPACKE_zheev_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c index fb33c3e2a..840c53876 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_2stage_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_2stage_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c index 5af2a1269..b8509e04f 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zheevd_work.c @@ -79,7 +79,7 @@ lapack_int LAPACKE_zheevd_work( int matrix_layout, char jobz, char uplo, info = info - 1; } /* Transpose output matrices */ - if ( jobz == 'V') { + if ( jobz == 'V' || jobz == 'v' ) { LAPACKE_zge_trans( LAPACK_COL_MAJOR, n, n, a_t, lda_t, a, lda ); } else { LAPACKE_zhe_trans( LAPACK_COL_MAJOR, uplo, n, a_t, lda_t, a, lda ); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c index 8c4a5c374..aa2d84d84 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { if( 
matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c index 62fce1f27..f77894204 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegst_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zhegst_work( int matrix_layout, lapack_int itype, char uplo, lapack_int n, lapack_complex_double* a, - lapack_int lda, lapack_complex_double* b, + lapack_int lda, const lapack_complex_double* b, lapack_int ldb ) { lapack_int info = 0; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c index 683fcf487..587e2d4be 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c index 0f1b415a9..43569d99e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegv_2stage.c @@ -50,10 +50,10 @@ lapack_int LAPACKE_zhegv_2stage( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c index 1242a0eda..c287595ad 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvd.c @@ -55,10 +55,10 @@ lapack_int LAPACKE_zhegvd( int matrix_layout, lapack_int itype, char jobz, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -6; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -8; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c index 492bc4dad..83f2bda2e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zhegvx.c @@ -61,7 +61,7 @@ lapack_int LAPACKE_zhegvx( int matrix_layout, lapack_int itype, char jobz, if( LAPACKE_d_nancheck( 1, &abstol, 1 ) ) { return -15; } - if( LAPACKE_zge_nancheck( matrix_layout, n, n, b, ldb ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, b, ldb ) ) { return -9; } if( LAPACKE_lsame( range, 'v' ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c index a07bc8d52..15a8cc576 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_zhetri2x.c @@ -46,7 +46,7 @@ lapack_int LAPACKE_zhetri2x( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c index bb4e57b1e..fe36ed811 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlacpy_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlacpy_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlacpy( &uplo, &m, &n, a, &lda, b, &ldb ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_int ldb_t = MAX(1,m); diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c index e62f8a4e3..cccc4053e 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlantr_work.c @@ -41,45 +41,46 @@ double LAPACKE_zlantr_work( int matrix_layout, char norm, char uplo, lapack_int info = 0; double res = 0.; if( matrix_layout == LAPACK_COL_MAJOR ) { - /* Call LAPACK function and adjust info */ + /* Call LAPACK function */ res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a, &lda, work ); } else if( matrix_layout == LAPACK_ROW_MAJOR ) { - lapack_int lda_t = MAX(1,m); - lapack_complex_double* a_t = NULL; double* work_lapack = NULL; + char norm_lapack; + char uplo_lapack; /* Check leading dimension(s) */ if( lda < n ) { info = -8; LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); return info; } - /* Allocate memory for temporary array(s) */ - a_t = (lapack_complex_double*) - LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,MAX(m,n)) ); - if( a_t == NULL ) { - info = LAPACK_TRANSPOSE_MEMORY_ERROR; - goto exit_level_0; + if( LAPACKE_lsame( norm, '1' ) || LAPACKE_lsame( norm, 'o' ) ) { + norm_lapack = 'i'; + } else if( LAPACKE_lsame( norm, 'i' ) ) { + norm_lapack = '1'; + } else { + norm_lapack = norm; + } + if( LAPACKE_lsame( uplo, 'u' ) ) { + uplo_lapack = 'l'; + } else { + uplo_lapack = 'u'; } /* Allocate memory for work array(s) */ - if( LAPACKE_lsame( norm, 'i' ) ) { - work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,m) ); + if( LAPACKE_lsame( norm_lapack, 'i' ) ) { + work_lapack = (double*)LAPACKE_malloc( sizeof(double) * MAX(1,n) ); if( work_lapack == NULL ) { info = LAPACK_WORK_MEMORY_ERROR; - goto exit_level_1; + goto exit_level_0; } } - /* Transpose input matrices */ - LAPACKE_ztr_trans( matrix_layout, uplo, diag, MAX(m,n), a, lda, a_t, lda_t ); - /* Call LAPACK function and adjust info */ - res = LAPACK_zlantr( &norm, &uplo, &diag, &m, &n, a_t, &lda_t, work_lapack ); + /* Call LAPACK function */ + res = LAPACK_zlantr( &norm_lapack, &uplo_lapack, &diag, &n, &m, a, &lda, work_lapack ); /* Release memory and exit */ if( work_lapack ) { LAPACKE_free( work_lapack ); } -exit_level_1: - LAPACKE_free( a_t ); exit_level_0: - if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) { + if( info == LAPACK_WORK_MEMORY_ERROR ) { LAPACKE_xerbla( "LAPACKE_zlantr_work", info ); } } else { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c index 7e37d559c..8bf1ee767 
100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlascl.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlascl.c @@ -83,6 +83,7 @@ lapack_int LAPACKE_zlascl( int matrix_layout, char type, lapack_int kl, LAPACKE_zgb_nancheck( LAPACK_COL_MAJOR, n, m, n-1, 1, a-1, lda+1 ) ) { return -9; } + break; case 'B': // TYPE = 'B' - lower part of symmetric band matrix (assume m==n) if( LAPACKE_zhb_nancheck( matrix_layout, 'L', n, kl, a, lda ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c index 9056e8fca..ecb6cba25 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zlaset_work.c @@ -42,9 +42,6 @@ lapack_int LAPACKE_zlaset_work( int matrix_layout, char uplo, lapack_int m, if( matrix_layout == LAPACK_COL_MAJOR ) { /* Call LAPACK function and adjust info */ LAPACK_zlaset( &uplo, &m, &n, &alpha, &beta, a, &lda ); - if( info < 0 ) { - info = info - 1; - } } else if( matrix_layout == LAPACK_ROW_MAJOR ) { lapack_int lda_t = MAX(1,m); lapack_complex_double* a_t = NULL; diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c index 2826efa53..074b15303 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsyconv.c @@ -45,7 +45,7 @@ lapack_int LAPACKE_zsyconv( int matrix_layout, char uplo, char way, lapack_int n #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zsy_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c index 7442702aa..3c85f9796 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2.c @@ -34,7 +34,7 @@ #include "lapacke_utils.h" lapack_int LAPACKE_zsytrs2( int matrix_layout, char uplo, lapack_int n, - lapack_int nrhs, lapack_complex_double* a, + lapack_int nrhs, const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c index ec05ce6d5..cdc97fa02 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zsytrs2_work.c @@ -35,7 +35,7 @@ lapack_int LAPACKE_zsytrs2_work( int matrix_layout, char uplo, lapack_int n, lapack_int nrhs, - lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* a, lapack_int lda, const lapack_int* ipiv, lapack_complex_double* b, lapack_int ldb, lapack_complex_double* work ) diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c index 8a5dfc271..8e8789ec6 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c +++ b/lapack-netlib/LAPACKE/src/lapacke_ztrttf.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttf( int matrix_layout, char transr, char uplo, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -5; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c index 5dcf633bb..bd8485108 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c +++ 
b/lapack-netlib/LAPACKE/src/lapacke_ztrttp.c @@ -44,7 +44,7 @@ lapack_int LAPACKE_ztrttp( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_ztr_nancheck( matrix_layout, uplo, 'n', n, a, lda ) ) { return -4; } } diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c index 51785347e..adfaa7db9 100644 --- a/lapack-netlib/LAPACKE/src/lapacke_zungtr.c +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtr.c @@ -48,7 +48,7 @@ lapack_int LAPACKE_zungtr( int matrix_layout, char uplo, lapack_int n, #ifndef LAPACK_DISABLE_NAN_CHECK if( LAPACKE_get_nancheck() ) { /* Optionally check input matrices for NaNs */ - if( LAPACKE_zge_nancheck( matrix_layout, n, n, a, lda ) ) { + if( LAPACKE_zhe_nancheck( matrix_layout, uplo, n, a, lda ) ) { return -4; } if( LAPACKE_z_nancheck( n-1, tau, 1 ) ) { diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c new file mode 100644 index 000000000..71418fb84 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row.c @@ -0,0 +1,83 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. 
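+
+  NOTE: this is the complex double-precision counterpart of
+  LAPACKE_sorgtsqr_row and shares its structure; the one twist is that the
+  optimal workspace size comes back in a complex scalar and is converted
+  with LAPACK_Z2INT. A sketch of that query idiom (mirroring the code
+  below, not part of the Reference-LAPACK sources):
+
+      lapack_complex_double work_query;
+      lapack_int lwork = -1;
+      info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb,
+                                        a, lda, t, ldt, &work_query, lwork );
+      lwork = LAPACK_Z2INT( work_query );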
+***************************************************************************** +* Contents: Native high-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt ) +{ + lapack_int info = 0; + lapack_int lwork = -1; + lapack_complex_double* work = NULL; + lapack_complex_double work_query; + if( matrix_layout != LAPACK_COL_MAJOR && matrix_layout != LAPACK_ROW_MAJOR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", -1 ); + return -1; + } +#ifndef LAPACK_DISABLE_NAN_CHECK + if( LAPACKE_get_nancheck() ) { + /* Optionally check input matrices for NaNs */ + if( LAPACKE_zge_nancheck( matrix_layout, m, n, a, lda ) ) { + return -6; + } + if( LAPACKE_zge_nancheck( matrix_layout, nb, n, t, ldt ) ) { + return -8; + } + } +#endif + /* Query optimal working array(s) size */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, &work_query, lwork ); + if( info != 0 ) { + goto exit_level_0; + } + lwork = LAPACK_Z2INT( work_query ); + /* Allocate memory for work arrays */ + work = (lapack_complex_double*) + LAPACKE_malloc( sizeof(lapack_complex_double) * lwork ); + if( work == NULL ) { + info = LAPACK_WORK_MEMORY_ERROR; + goto exit_level_0; + } + /* Call middle-level interface */ + info = LAPACKE_zungtsqr_row_work( matrix_layout, m, n, mb, nb, + a, lda, t, ldt, work, lwork ); + /* Release memory and exit */ + LAPACKE_free( work ); +exit_level_0: + if( info == LAPACK_WORK_MEMORY_ERROR ) { + LAPACKE_xerbla( "LAPACKE_zungtsqr_row", info ); + } + return info; +} \ No newline at end of file diff --git a/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c new file mode 100644 index 000000000..909855864 --- /dev/null +++ b/lapack-netlib/LAPACKE/src/lapacke_zungtsqr_row_work.c @@ -0,0 +1,109 @@ +/***************************************************************************** + Copyright (c) 2020, Intel Corp. + All rights reserved. + + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are met: + + * Redistributions of source code must retain the above copyright notice, + this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + * Neither the name of Intel Corporation nor the names of its contributors + may be used to endorse or promote products derived from this software + without specific prior written permission. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE + LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + THE POSSIBILITY OF SUCH DAMAGE. +***************************************************************************** +* Contents: Native middle-level C interface to LAPACK function zungtsqr_row +* Author: Intel Corporation +*****************************************************************************/ + +#include "lapacke_utils.h" + +lapack_int LAPACKE_zungtsqr_row_work( int matrix_layout, lapack_int m, lapack_int n, + lapack_int mb, lapack_int nb, + lapack_complex_double* a, lapack_int lda, + const lapack_complex_double* t, lapack_int ldt, + lapack_complex_double* work, lapack_int lwork ) +{ + lapack_int info = 0; + if (matrix_layout == LAPACK_COL_MAJOR) { + /* Call LAPACK function and adjust info */ + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda, t, &ldt, + work, &lwork, &info); + if (info < 0) { + info = info - 1; + } + } else if (matrix_layout == LAPACK_ROW_MAJOR) { + lapack_int lda_t = MAX(1,m); + lapack_complex_double* a_t = NULL; + /* Check leading dimension(s) */ + if( lda < n ) { + info = -7; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + lapack_int ldt_t = MAX(1,nb); + lapack_complex_double* t_t = NULL; + /* Check leading dimension(s) */ + if( ldt < n ) { + info = -9; + LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info ); + return info; + } + /* Query optimal working array(s) size if requested */ + if( lwork == -1 ) { + LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a, &lda_t, t, &ldt_t, + work, &lwork, &info ); + return (info < 0) ? 
(info - 1) : info;
+        }
+        /* Allocate memory for temporary array(s) */
+        a_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * lda_t * MAX(1,n) );
+        if( a_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_0;
+        }
+        t_t = (lapack_complex_double*)
+            LAPACKE_malloc( sizeof(lapack_complex_double) * ldt_t * MAX(1,n) );
+        if( t_t == NULL ) {
+            info = LAPACK_TRANSPOSE_MEMORY_ERROR;
+            goto exit_level_1;
+        }
+        /* Transpose input matrices */
+        LAPACKE_zge_trans( matrix_layout, m, n, a, lda, a_t, lda_t );
+        LAPACKE_zge_trans( matrix_layout, nb, n, t, ldt, t_t, ldt_t );
+        /* Call LAPACK function and adjust info */
+        LAPACK_zungtsqr_row( &m, &n, &mb, &nb, a_t, &lda_t, t_t, &ldt_t,
+                             work, &lwork, &info );
+        if( info < 0 ) {
+            info = info - 1;
+        }
+        /* Transpose output matrices */
+        LAPACKE_zge_trans( LAPACK_COL_MAJOR, m, n, a_t, lda_t, a, lda );
+        /* Release memory and exit */
+        LAPACKE_free( t_t );
+exit_level_1:
+        LAPACKE_free( a_t );
+exit_level_0:
+        if( info == LAPACK_TRANSPOSE_MEMORY_ERROR ) {
+            LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+        }
+    } else {
+        info = -1;
+        LAPACKE_xerbla( "LAPACKE_zungtsqr_row_work", info );
+    }
+    return info;
+}
\ No newline at end of file
diff --git a/lapack-netlib/SRC/Makefile b/lapack-netlib/SRC/Makefile
index 83baac875..d1ee96667 100644
--- a/lapack-netlib/SRC/Makefile
+++ b/lapack-netlib/SRC/Makefile
@@ -135,14 +135,14 @@ SLASRC_O = \
    slaqgb.o slaqge.o slaqp2.o slaqps.o slaqsb.o slaqsp.o slaqsy.o \
    slaqr0.o slaqr1.o slaqr2.o slaqr3.o slaqr4.o slaqr5.o \
    slaqtr.o slar1v.o slar2v.o ilaslr.o ilaslc.o \
-   slarf.o slarfb.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
+   slarf.o slarfb.o slarfb_gett.o slarfg.o slarfgp.o slarft.o slarfx.o slarfy.o slargv.o \
    slarrv.o slartv.o \
    slarz.o slarzb.o slarzt.o slaswp.o slasy2.o slasyf.o slasyf_rook.o \
    slasyf_rk.o \
    slatbs.o slatdf.o slatps.o slatrd.o slatrs.o slatrz.o \
    slauu2.o slauum.o sopgtr.o sopmtr.o sorg2l.o sorg2r.o \
    sorgbr.o sorghr.o sorgl2.o sorglq.o sorgql.o sorgqr.o sorgr2.o \
-   sorgrq.o sorgtr.o sorgtsqr.o sorm2l.o sorm2r.o sorm22.o \
+   sorgrq.o sorgtr.o sorgtsqr.o sorgtsqr_row.o sorm2l.o sorm2r.o sorm22.o \
    sormbr.o sormhr.o sorml2.o sormlq.o sormql.o sormqr.o sormr2.o \
    sormr3.o sormrq.o sormrz.o sormtr.o spbcon.o spbequ.o spbrfs.o \
    spbstf.o spbsv.o spbsvx.o \
@@ -181,7 +181,7 @@ SLASRC_O = \
    sgeqrt.o sgeqrt2.o sgeqrt3.o sgemqrt.o \
    stpqrt.o stpqrt2.o stpmqrt.o stprfb.o \
    sgelqt.o sgelqt3.o sgemlqt.o \
-   sgetsls.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
+   sgetsls.o sgetsqrhrt.o sgeqr.o slatsqr.o slamtsqr.o sgemqr.o \
    sgelq.o slaswlq.o slamswlq.o sgemlq.o \
    stplqt.o stplqt2.o stpmlqt.o \
    sorhr_col.o slaorhr_col_getrfnp.o slaorhr_col_getrfnp2.o \
@@ -250,7 +250,7 @@ CLASRC_O = \
    claqhb.o claqhe.o claqhp.o claqp2.o claqps.o claqsb.o \
    claqr0.o claqr1.o claqr2.o claqr3.o claqr4.o claqr5.o \
    claqsp.o claqsy.o clar1v.o clar2v.o ilaclr.o ilaclc.o \
-   clarf.o clarfb.o clarfg.o clarft.o clarfgp.o \
+   clarf.o clarfb.o clarfb_gett.o clarfg.o clarft.o clarfgp.o \
    clarfx.o clarfy.o clargv.o clarnv.o clarrv.o clartg.o clartv.o \
    clarz.o clarzb.o clarzt.o clascl.o claset.o clasr.o classq.o \
    claswp.o clasyf.o clasyf_rook.o clasyf_rk.o clasyf_aa.o \
@@ -278,7 +278,7 @@ CLASRC_O = \
    ctptrs.o ctrcon.o ctrevc.o ctrevc3.o ctrexc.o ctrrfs.o ctrsen.o ctrsna.o \
    ctrsyl.o ctrti2.o ctrtri.o ctrtrs.o ctzrzf.o cung2l.o cung2r.o \
    cungbr.o cunghr.o cungl2.o cunglq.o cungql.o cungqr.o cungr2.o \
-   cungrq.o cungtr.o cungtsqr.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o
cunm22.o \ + cungrq.o cungtr.o cungtsqr.o cungtsqr_row.o cunm2l.o cunm2r.o cunmbr.o cunmhr.o cunml2.o cunm22.o \ cunmlq.o cunmql.o cunmqr.o cunmr2.o cunmr3.o cunmrq.o cunmrz.o \ cunmtr.o cupgtr.o cupmtr.o icmax1.o scsum1.o cstemr.o \ chfrk.o ctfttp.o clanhf.o cpftrf.o cpftri.o cpftrs.o ctfsm.o ctftri.o \ @@ -289,7 +289,7 @@ CLASRC_O = \ cgeqrt.o cgeqrt2.o cgeqrt3.o cgemqrt.o \ ctpqrt.o ctpqrt2.o ctpmqrt.o ctprfb.o \ cgelqt.o cgelqt3.o cgemlqt.o \ - cgetsls.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ + cgetsls.o cgetsqrhrt.o cgeqr.o clatsqr.o clamtsqr.o cgemqr.o \ cgelq.o claswlq.o clamswlq.o cgemlq.o \ ctplqt.o ctplqt2.o ctpmlqt.o \ cunhr_col.o claunhr_col_getrfnp.o claunhr_col_getrfnp2.o \ @@ -342,14 +342,14 @@ DLASRC_O = \ dlaqgb.o dlaqge.o dlaqp2.o dlaqps.o dlaqsb.o dlaqsp.o dlaqsy.o \ dlaqr0.o dlaqr1.o dlaqr2.o dlaqr3.o dlaqr4.o dlaqr5.o \ dlaqtr.o dlar1v.o dlar2v.o iladlr.o iladlc.o \ - dlarf.o dlarfb.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ + dlarf.o dlarfb.o dlarfb_gett.o dlarfg.o dlarfgp.o dlarft.o dlarfx.o dlarfy.o \ dlargv.o dlarrv.o dlartv.o \ dlarz.o dlarzb.o dlarzt.o dlaswp.o dlasy2.o \ dlasyf.o dlasyf_rook.o dlasyf_rk.o \ dlatbs.o dlatdf.o dlatps.o dlatrd.o dlatrs.o dlatrz.o dlauu2.o \ dlauum.o dopgtr.o dopmtr.o dorg2l.o dorg2r.o \ dorgbr.o dorghr.o dorgl2.o dorglq.o dorgql.o dorgqr.o dorgr2.o \ - dorgrq.o dorgtr.o dorgtsqr.o dorm2l.o dorm2r.o dorm22.o \ + dorgrq.o dorgtr.o dorgtsqr.o dorgtsqr_row.o dorm2l.o dorm2r.o dorm22.o \ dormbr.o dormhr.o dorml2.o dormlq.o dormql.o dormqr.o dormr2.o \ dormr3.o dormrq.o dormrz.o dormtr.o dpbcon.o dpbequ.o dpbrfs.o \ dpbstf.o dpbsv.o dpbsvx.o \ @@ -389,7 +389,7 @@ DLASRC_O = \ dgeqrt.o dgeqrt2.o dgeqrt3.o dgemqrt.o \ dtpqrt.o dtpqrt2.o dtpmqrt.o dtprfb.o \ dgelqt.o dgelqt3.o dgemlqt.o \ - dgetsls.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ + dgetsls.o dgetsqrhrt.o dgeqr.o dlatsqr.o dlamtsqr.o dgemqr.o \ dgelq.o dlaswlq.o dlamswlq.o dgemlq.o \ dtplqt.o dtplqt2.o dtpmlqt.o \ dorhr_col.o dlaorhr_col_getrfnp.o dlaorhr_col_getrfnp2.o \ @@ -455,7 +455,7 @@ ZLASRC_O = \ zlaqhb.o zlaqhe.o zlaqhp.o zlaqp2.o zlaqps.o zlaqsb.o \ zlaqr0.o zlaqr1.o zlaqr2.o zlaqr3.o zlaqr4.o zlaqr5.o \ zlaqsp.o zlaqsy.o zlar1v.o zlar2v.o ilazlr.o ilazlc.o \ - zlarcm.o zlarf.o zlarfb.o \ + zlarcm.o zlarf.o zlarfb.o zlarfb_gett.o \ zlarfg.o zlarft.o zlarfgp.o \ zlarfx.o zlarfy.o zlargv.o zlarnv.o zlarrv.o zlartg.o zlartv.o \ zlarz.o zlarzb.o zlarzt.o zlascl.o zlaset.o zlasr.o \ @@ -484,7 +484,7 @@ ZLASRC_O = \ ztptrs.o ztrcon.o ztrevc.o ztrevc3.o ztrexc.o ztrrfs.o ztrsen.o ztrsna.o \ ztrsyl.o ztrti2.o ztrtri.o ztrtrs.o ztzrzf.o zung2l.o \ zung2r.o zungbr.o zunghr.o zungl2.o zunglq.o zungql.o zungqr.o zungr2.o \ - zungrq.o zungtr.o zungtsqr.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ + zungrq.o zungtr.o zungtsqr.o zungtsqr_row.o zunm2l.o zunm2r.o zunmbr.o zunmhr.o zunml2.o zunm22.o \ zunmlq.o zunmql.o zunmqr.o zunmr2.o zunmr3.o zunmrq.o zunmrz.o \ zunmtr.o zupgtr.o \ zupmtr.o izmax1.o dzsum1.o zstemr.o \ @@ -498,7 +498,7 @@ ZLASRC_O = \ ztpqrt.o ztpqrt2.o ztpmqrt.o ztprfb.o \ ztplqt.o ztplqt2.o ztpmlqt.o \ zgelqt.o zgelqt3.o zgemlqt.o \ - zgetsls.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ + zgetsls.o zgetsqrhrt.o zgeqr.o zlatsqr.o zlamtsqr.o zgemqr.o \ zgelq.o zlaswlq.o zlamswlq.o zgemlq.o \ zunhr_col.o zlaunhr_col_getrfnp.o zlaunhr_col_getrfnp2.o \ zhetrd_2stage.o zhetrd_he2hb.o zhetrd_hb2st.o zhb2st_kernels.o \ diff --git a/lapack-netlib/SRC/cgesdd.f b/lapack-netlib/SRC/cgesdd.f index 07341593f..34a80beea 100644 --- a/lapack-netlib/SRC/cgesdd.f +++ 
b/lapack-netlib/SRC/cgesdd.f
@@ -281,9 +281,9 @@
      $                   CUNGQR, CUNMBR, SBDSDC, SLASCL, XERBLA
 *     ..
 *     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, SISNAN
       REAL               SLAMCH, CLANGE
-      EXTERNAL           LSAME, SLAMCH, CLANGE
+      EXTERNAL           LSAME, SLAMCH, CLANGE, SISNAN
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN, SQRT
@@ -647,6 +647,10 @@
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
       ANRM = CLANGE( 'M', M, N, A, LDA, DUM )
+      IF( SISNAN ( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
       ISCL = 0
       IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
          ISCL = 1
diff --git a/lapack-netlib/SRC/cgetsqrhrt.f b/lapack-netlib/SRC/cgetsqrhrt.f
new file mode 100644
index 000000000..4e4dc1d4a
--- /dev/null
+++ b/lapack-netlib/SRC/cgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b CGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CGETSQRHRT + dependencies
+*> [TGZ]
+*> [ZIP]
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*      $                       LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CGETSQRHRT computes a NB2-sized column blocked QR-factorization
+*> of a complex M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally a NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in CGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of CGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is COMPLEX array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup complexOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+      COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE
+      PARAMETER          ( CONE = ( 1.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
+     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CLATSQR, CUNGTSQR_ROW, CUNHR_COL,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, REAL, CMPLX, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      INFO = 0
+      LQUERY = LWORK.EQ.-1
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB1.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB1.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( NB2.LT.1 ) THEN
+         INFO = -5
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -7
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
+         INFO = -9
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store array:
+*        a) Matrix T and WORK for CLATSQR;
+*        b) N-by-N upper-triangular factor R_tsqr;
+*        c) Matrix T and array WORK for CUNGTSQR_ROW;
+*        d) Diagonal D for CUNHR_COL.
+*
+         IF( LWORK.LT.N*N+1 .AND.
.NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of CUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL CLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL CCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL CUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL CUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the CUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. 
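Since CGETSQRHRT is new in this release, a hedged usage sketch may help; the argument list follows the SUBROUTINE statement documented above, called here directly through the Fortran symbol with the standard two-phase workspace query. The cgetsqrhrt_ trailing-underscore mangling, the example sizes, and the test data are assumptions for illustration, not part of this patch (error handling elided):

```c
#include <complex.h>
#include <stdio.h>
#include <stdlib.h>

typedef float _Complex cfloat;

/* Fortran calling convention: everything by reference. */
extern void cgetsqrhrt_(const int *m, const int *n, const int *mb1,
                        const int *nb1, const int *nb2, cfloat *a,
                        const int *lda, cfloat *t, const int *ldt,
                        cfloat *work, const int *lwork, int *info);

int main(void)
{
    /* Constraints from the docs: M >= N, MB1 > N, N >= NB1 >= 1, NB2 >= 1. */
    int m = 1000, n = 32, mb1 = 128, nb1 = 32, nb2 = 32;
    int lda = m, ldt = nb2, info = 0, lwork = -1;
    cfloat *a = malloc((size_t)lda * n * sizeof *a);
    cfloat *t = malloc((size_t)ldt * n * sizeof *t);
    cfloat wq;

    for (int j = 0; j < n; ++j)               /* arbitrary test matrix */
        for (int i = 0; i < m; ++i)
            a[i + (size_t)j * lda] = (i == j) + 0.001f * (i % 7);

    /* Phase 1: LWORK = -1 returns the optimal size in WORK(1). */
    cgetsqrhrt_(&m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, &wq, &lwork,
                &info);
    if (info == 0) {
        lwork = (int)crealf(wq);
        cfloat *work = malloc((size_t)lwork * sizeof *work);
        /* Phase 2: blocked TSQR + Householder reconstruction. */
        cgetsqrhrt_(&m, &n, &mb1, &nb1, &nb2, a, &lda, t, &ldt, work,
                    &lwork, &info);
        free(work);
    }
    printf("info = %d\n", info);
    free(a); free(t);
    return 0;
}
```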
+*
+      DO I = 1, N
+         IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN
+            DO J = I, N
+               A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I )
+            END DO
+         ELSE
+            CALL CCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA )
+         END IF
+      END DO
+*
+      WORK( 1 ) = CMPLX( LWORKOPT )
+      RETURN
+*
+*     End of CGETSQRHRT
+*
+      END
\ No newline at end of file
diff --git a/lapack-netlib/SRC/cggglm.f b/lapack-netlib/SRC/cggglm.f
index 336f41909..9c8e0eec3 100644
--- a/lapack-netlib/SRC/cggglm.f
+++ b/lapack-netlib/SRC/cggglm.f
@@ -271,8 +271,15 @@
 *
 *     Quick return if possible
 *
-      IF( N.EQ.0 )
-     $   RETURN
+      IF( N.EQ.0 ) THEN
+         DO I = 1, M
+            X(I) = CZERO
+         END DO
+         DO I = 1, P
+            Y(I) = CZERO
+         END DO
+         RETURN
+      END IF
 *
 *     Compute the GQR factorization of matrices A and B:
 *
diff --git a/lapack-netlib/SRC/chgeqz.f b/lapack-netlib/SRC/chgeqz.f
index 4725e7169..bcf5acd0b 100644
--- a/lapack-netlib/SRC/chgeqz.f
+++ b/lapack-netlib/SRC/chgeqz.f
@@ -319,14 +319,14 @@
       REAL               ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL,
      $                   C, SAFMIN, TEMP, TEMP2, TEMPR, ULP
       COMPLEX            ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2,
-     $                   CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1,
+     $                   CTEMP3, ESHIFT, S, SHIFT, SIGNBC,
      $                   U12, X, ABI12, Y
 *     ..
 *     .. External Functions ..
       COMPLEX            CLADIV
       LOGICAL            LSAME
       REAL               CLANHS, SLAMCH
-      EXTERNAL           CLADIV, LLSAME, CLANHS, SLAMCH
+      EXTERNAL           CLADIV, LSAME, CLANHS, SLAMCH
 *     ..
 *     .. External Subroutines ..
       EXTERNAL           CLARTG, CLASET, CROT, CSCAL, XERBLA
@@ -351,6 +351,7 @@
          ILSCHR = .TRUE.
          ISCHUR = 2
       ELSE
+         ILSCHR = .TRUE.
          ISCHUR = 0
       END IF
 *
@@ -364,6 +365,7 @@
          ILQ = .TRUE.
          ICOMPQ = 3
       ELSE
+         ILQ = .TRUE.
         ICOMPQ = 0
       END IF
 *
@@ -377,6 +379,7 @@
          ILZ = .TRUE.
          ICOMPZ = 3
       ELSE
+         ILZ = .TRUE.
         ICOMPZ = 0
       END IF
 *
diff --git a/lapack-netlib/SRC/chseqr.f b/lapack-netlib/SRC/chseqr.f
index cfcf725b2..32b6fa87b 100644
--- a/lapack-netlib/SRC/chseqr.f
+++ b/lapack-netlib/SRC/chseqr.f
@@ -320,10 +320,10 @@
 *     .    CLAHQR because of insufficient subdiagonal scratch space.
 *     .    (This is a hard limit.) ====
       INTEGER            NTINY
-      PARAMETER          ( NTINY = 11 )
+      PARAMETER          ( NTINY = 15 )
 *
 *     ==== NL allocates some local workspace to help small matrices
-*     .    through a rare CLAHQR failure.  NL > NTINY = 11 is
+*     .    through a rare CLAHQR failure.  NL > NTINY = 15 is
 *     .    required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom-
 *     .    mended.  (The default value of NMIN is 75.)  Using NL = 49
 *     .    allows up to six simultaneous shifts and a 16-by-16
diff --git a/lapack-netlib/SRC/claqr0.f b/lapack-netlib/SRC/claqr0.f
index 2f0ea20db..233721352 100644
--- a/lapack-netlib/SRC/claqr0.f
+++ b/lapack-netlib/SRC/claqr0.f
@@ -260,7 +260,7 @@
 *     .    CLAHQR because of insufficient subdiagonal scratch space.
 *     .    (This is a hard limit.) ====
       INTEGER            NTINY
-      PARAMETER          ( NTINY = 11 )
+      PARAMETER          ( NTINY = 15 )
 *
 *     ==== Exceptional deflation windows:  try to cure rare
 *     .    slow convergence by varying the size of the
@@ -355,22 +355,22 @@
       END IF
 *
 *     ==== NWR = recommended deflation window size.  At this
-*     .    point,  N .GT. NTINY = 11, so there is enough
+*     .    point,  N .GT. NTINY = 15, so there is enough
 *     .    subdiagonal workspace for NWR.GE.2 as required.
 *     .    (In fact, there is enough subdiagonal space for
-*     .    NWR.GE.3.) ====
+*     .    NWR.GE.4.) ====
 *
       NWR = ILAENV( 13, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK )
       NWR = MAX( 2, NWR )
       NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR )
 *
 *     ==== NSR = recommended number of simultaneous shifts.
-*     .    At this point N .GT. NTINY = 11, so there is at
+*     .    At this point N .GT. NTINY = 15, so there is at
 *     .    enough subdiagonal workspace for NSR to be even
 *     .
and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -418,7 +418,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -558,7 +558,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAQR4 or * . CLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -659,7 +659,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr4.f b/lapack-netlib/SRC/claqr4.f index fba286df7..94484e798 100644 --- a/lapack-netlib/SRC/claqr4.f +++ b/lapack-netlib/SRC/claqr4.f @@ -270,7 +270,7 @@ * . CLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -365,22 +365,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'CLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -428,7 +428,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -568,7 +568,7 @@ * * ==== Got NS/2 or fewer shifts? Use CLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -663,7 +663,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/claqr5.f b/lapack-netlib/SRC/claqr5.f index e4317a3ad..71f26d8c9 100644 --- a/lapack-netlib/SRC/claqr5.f +++ b/lapack-netlib/SRC/claqr5.f @@ -69,10 +69,9 @@ *> matrix entries. 
*> = 1: CLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: CLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complexOTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE CLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX ALPHA, BETA, CDUM, REFSUM REAL H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. 
+* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL CLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = CONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL CLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL CLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*CONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*CONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL CLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL CLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL CLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = CONJG( V( 1, M ) )* - $ ( H( K+1, J )+CONJG( V( 2, M ) )*H( K+2, J )+ - $ CONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = CONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+CONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*CONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*CONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = CONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +CONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +CONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,22 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*CONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*CONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = CONJG( V( 1, M ) )* + $ ( H( K+1, J )+CONJG( V( 2, M ) )* + $ H( K+2, J )+CONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*CONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*CONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -687,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL CGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL CLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL CLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL CGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL CLACPY( 'ALL', JLEN, 
NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL CLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL CTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL CGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL CLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL CTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL CLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL CLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL CTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL CLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL CTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL CGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL CLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of CLAQR5 ==== * diff --git a/lapack-netlib/SRC/clarfb_gett.f b/lapack-netlib/SRC/clarfb_gett.f new file mode 100644 index 000000000..ee6959ed8 --- /dev/null +++ b/lapack-netlib/SRC/clarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b CLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download CLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +*> +* Definition: +* =========== +* +* SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
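Concretely, the extra work in Case (A) is four triangular multiplies: when V1 is not the identity, the column-block-2 path performs col2_(2) and col2_(6) (the CTRMM applications of unit_lower_tr_of_(A1)**H and unit_lower_tr_of_(A1)), and the column-block-1 path performs col1_(2) and col1_(5); all four drop out in Case (B), as the combined algorithm below makes explicit.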
+*>
+*>    if( IDENT != 'I' ) then
+*>     if ( N > K ) then
+*>       (First Step - column block 2)
+*>       col2_(1) W2: = A2
+*>       col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2
+*>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>       col2_(4) W2: = T * W2
+*>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>       col2_(7) A2: = A2 - W2
+*>     end if
+*>     (Second Step - column block 1)
+*>     col1_(1) W1: = A1
+*>     col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1
+*>     col1_(3) W1: = T * W1
+*>     col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>     col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>     col1_(6) square A1: = A1 - W1
+*>    end if
+*>
+*>    Case (B), when V1 is an identity matrix, i.e. IDENT == 'I',
+*>    requires fewer computations than Case (A):
+*>
+*>    if( IDENT == 'I' ) then
+*>     if ( N > K ) then
+*>       (First Step - column block 2)
+*>       col2_(1) W2: = A2
+*>       col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>       col2_(4) W2: = T * W2
+*>       col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>       col2_(7) A2: = A2 - W2
+*>     end if
+*>     (Second Step - column block 1)
+*>     col1_(1) W1: = A1
+*>     col1_(3) W1: = T * W1
+*>     col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>     col1_(6) upper-triangular_of_(A1): = A1 - W1
+*>    end if
+*>
+*>    Combining cases (A) and (B), the resulting algorithm is
+*>    (note that the Second Step is always performed, matching the
+*>    executable code below, while the First Step exists only
+*>    if N > K):
+*>
+*>    if ( N > K ) then
+*>
+*>     (First Step - column block 2)
+*>
+*>     col2_(1) W2: = A2
+*>     if( IDENT != 'I' ) then
+*>       col2_(2) W2: = (V1**H) * W2
+*>                    = (unit_lower_tr_of_(A1)**H) * W2
+*>     end if
+*>     col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*>     col2_(4) W2: = T * W2
+*>     col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>     if( IDENT != 'I' ) then
+*>       col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>     end if
+*>     col2_(7) A2: = A2 - W2
+*>
+*>    end if
+*>
+*>    (Second Step - column block 1)
+*>
+*>    col1_(1) W1: = A1
+*>    if( IDENT != 'I' ) then
+*>      col1_(2) W1: = (V1**H) * W1
+*>                   = (unit_lower_tr_of_(A1)**H) * W1
+*>    end if
+*>    col1_(3) W1: = T * W1
+*>    col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>    if( IDENT != 'I' ) then
+*>      col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>      col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*>    end if
+*>    col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE CLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+     $                        WORK, LDWORK )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          IDENT
+      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      COMPLEX            A( LDA, * ), B( LDB, * ), T( LDT, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      COMPLEX            CONE, CZERO
+      PARAMETER          ( CONE  = ( 1.0E+0, 0.0E+0 ),
+     $                     CZERO = ( 0.0E+0, 0.0E+0 ) )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LNOTIDENT
+      INTEGER            I, J
+*     ..
+*     .. External Functions ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           CCOPY, CGEMM, CTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N )
+     $   RETURN
+*
+      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+*     ------------------------------------------------------------------
+*
+*     First Step. Computation of the Column Block 2:
+*
+*        ( A2 ) := H * ( A2 )
+*        ( B2 )        ( B2 )
+*
+*     ------------------------------------------------------------------
+*
+      IF( N.GT.K ) THEN
+*
+*        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+*        into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+         DO J = 1, N-K
+            CALL CCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+         END DO
+
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL CTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL CGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB,
+     $                  B( 1, K+1 ), LDB, CONE, WORK, LDWORK )
+         END IF
+*
+*        col2_(4) Compute W2: = T * W2,
+*        T is upper-triangular.
+*
+         CALL CTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT,
+     $               WORK, LDWORK )
+*
+*        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL CGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB,
+     $                  WORK, LDWORK, CONE, B( 1, K+1 ), LDB )
+         END IF
+*
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL CTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(7) Compute A2: = A2 - W2 =
+*        = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+*        column-by-column.
+*
+         DO J = 1, N-K
+            DO I = 1, K
+               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     ------------------------------------------------------------------
+*
+*     Second Step. Computation of the Column Block 1:
+*
+*        ( A1 ) := H * ( A1 )
+*        ( B1 )        (  0 )
+*
+*     ------------------------------------------------------------------
+*
+*     col1_(1) Compute W1: = A1. Copy the upper-triangular
+*     A1 = A(1:K, 1:K) into the upper-triangular
+*     W1 = WORK(1:K, 1:K) column-by-column.
+*
+      DO J = 1, K
+         CALL CCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+      END DO
+*
+*     Set the subdiagonal elements of W1 to zero column-by-column.
+*
+      DO J = 1, K - 1
+         DO I = J + 1, K
+            WORK( I, J ) = CZERO
+         END DO
+      END DO
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular with zeroes below the diagonal.
+*
+         CALL CTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+      END IF
+*
+*     col1_(3) Compute W1: = T * W1,
+*     T is upper-triangular,
+*     W1 is upper-triangular with zeroes below the diagonal.
+*
+      CALL CTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT,
+     $            WORK, LDWORK )
+*
+*     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+*     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+      IF( M.GT.0 ) THEN
+         CALL CTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK,
+     $               B, LDB )
+      END IF
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular on input with zeroes below the diagonal,
+*        and square on output.
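+*        (In the CTRMM call below: SIDE='L' applies V1 from the left,
+*        UPLO='L' with DIAG='U' selects the unit lower-triangular
+*        part of the array A as V1, and TRANSA='N' applies V1 itself
+*        rather than its conjugate transpose.)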
+*
+         CALL CTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA,
+     $               WORK, LDWORK )
+*
+*        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+*        column-by-column. A1 is upper-triangular on input.
+*        If NOT IDENT, A1 is square on output, and W1 is square,
+*        if IDENT, A1 is upper-triangular on output,
+*        W1 is upper-triangular.
+*
+*        col1_(6)_a Compute elements of A1 below the diagonal.
+*
+         DO J = 1, K - 1
+            DO I = J + 1, K
+               A( I, J ) = - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+      DO J = 1, K
+         DO I = 1, J
+            A( I, J ) = A( I, J ) - WORK( I, J )
+         END DO
+      END DO
+*
+      RETURN
+*
+*     End of CLARFB_GETT
+*
+      END
diff --git a/lapack-netlib/SRC/ctgsja.f b/lapack-netlib/SRC/ctgsja.f
index 38a61068e..c96cbe022 100644
--- a/lapack-netlib/SRC/ctgsja.f
+++ b/lapack-netlib/SRC/ctgsja.f
@@ -401,7 +401,7 @@
 *     .. Parameters ..
       INTEGER            MAXIT
       PARAMETER          ( MAXIT = 40 )
-      REAL               ZERO, ONE
+      REAL               ZERO, ONE, HUGENUM
       PARAMETER          ( ZERO = 0.0E+0, ONE = 1.0E+0 )
       COMPLEX            CZERO, CONE
       PARAMETER          ( CZERO = ( 0.0E+0, 0.0E+0 ),
@@ -424,7 +424,8 @@
      $                   SLARTG, XERBLA
 *     ..
 *     .. Intrinsic Functions ..
-      INTRINSIC          ABS, CONJG, MAX, MIN, REAL
+      INTRINSIC          ABS, CONJG, MAX, MIN, REAL, HUGE
+      PARAMETER          ( HUGENUM = HUGE(ZERO) )
 *     ..
 *     .. Executable Statements ..
 *
@@ -610,9 +611,9 @@
 *
             A1 = REAL( A( K+I, N-L+I ) )
             B1 = REAL( B( I, N-L+I ) )
+            GAMMA = B1 / A1
 *
-            IF( A1.NE.ZERO ) THEN
-               GAMMA = B1 / A1
+            IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN
 *
                IF( GAMMA.LT.ZERO ) THEN
                   CALL CSSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB )
diff --git a/lapack-netlib/SRC/cungbr.f b/lapack-netlib/SRC/cungbr.f
index df25799ca..0dddd42a6 100644
--- a/lapack-netlib/SRC/cungbr.f
+++ b/lapack-netlib/SRC/cungbr.f
@@ -222,8 +222,8 @@
             CALL CUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
          ELSE
             IF( M.GT.1 ) THEN
-               CALL CUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL CUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
             END IF
          END IF
       ELSE
@@ -231,8 +231,8 @@
             CALL CUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
          ELSE
             IF( N.GT.1 ) THEN
-               CALL CUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL CUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
             END IF
          END IF
       END IF
diff --git a/lapack-netlib/SRC/cungtsqr_row.f b/lapack-netlib/SRC/cungtsqr_row.f
new file mode 100644
index 000000000..e1597c58b
--- /dev/null
+++ b/lapack-netlib/SRC/cungtsqr_row.f
@@ -0,0 +1,380 @@
+*> \brief \b CUNGTSQR_ROW
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download CUNGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*>
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+*      $                         LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*       ..
+*       .. Array Arguments ..
+*       COMPLEX           A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> CUNGTSQR_ROW generates an M-by-N complex matrix Q_out with
+*> orthonormal columns from the output of CLATSQR. These N orthonormal
+*> columns are the first N columns of a product of complex unitary
+*> matrices Q(k)_in of order M, which are returned by CLATSQR in
+*> a special format.
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of CLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine CLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in reverse order of
+*> the order in which CLATSQR generates the output blocks.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by CLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by CLATSQR to return
+*>          arrays A and T. NB >= 1.
+*>          (Note that if NB > N, then N is used instead of NB
+*>          as the column block size).
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is COMPLEX array, dimension (LDA,N)
+*>
+*>          On entry:
+*>
+*>             The elements on and above the diagonal are not used as
+*>             input. The elements below the diagonal represent the unit
+*>             lower-trapezoidal blocked matrix V computed by CLATSQR
+*>             that defines the input matrices Q_in(k) (ones on the
+*>             diagonal are not stored). See CLATSQR for more details.
+*>
+*>          On exit:
+*>
+*>             The array A contains an M-by-N orthonormal matrix Q_out,
+*>             i.e. the columns of A are orthogonal unit vectors.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is COMPLEX array,
+*>          dimension (LDT, N * NIRB)
+*>          where NIRB = Number_of_input_row_blocks
+*>                     = MAX( 1, CEIL((M-N)/(MB-N)) )
+*>          Let NICB = Number_of_input_col_blocks
+*>                   = CEIL(N/NB)
+*>
+*>          The upper-triangular block reflectors used to define the
+*>          input matrices Q_in(k), k=(1:NIRB*NICB). The block
+*>          reflectors are stored in compact form in NIRB block
+*>          reflector sequences. Each of the NIRB block reflector
+*>          sequences is stored in a larger NB-by-N column block of T
+*>          and consists of NICB smaller NB-by-NB upper-triangular
+*>          column blocks. See CLATSQR for more details on the format
+*>          of T.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.
+*>          LDT >= max(1,min(NB,N)).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) COMPLEX array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)),
+*>          where NBLOCAL=MIN(NB,N).
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
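+*>
+*>          For example (an illustrative sketch; WQUERY here names a
+*>          caller-chosen COMPLEX scratch array of length 1), the
+*>          usual two-call workspace-query pattern is:
+*>
+*>             CALL CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT,
+*>                                WQUERY, -1, INFO )
+*>             LWORK = INT( WQUERY( 1 ) )
+*>             (allocate WORK(1:LWORK), then)
+*>             CALL CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT,
+*>                                WORK, LWORK, INFO )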
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complexOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE CUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL CLARFB_GETT, CLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'CUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = CMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL CLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. 
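+*
+*     (Illustrative example, assuming M = 10, N = 2 and MB = 4:
+*     then MB2 = 2, ITMP = ( 10 - 4 - 1 ) / 2 = 2, IB_BOTTOM = 9 and
+*     NUM_ALL_ROW_BLOCKS = 4, so the bottom-up loop below visits the
+*     row blocks starting at IB = 9, 7, 5, and the top row block of
+*     size MB1 = MIN( MB, M ) = 4 is handled afterwards in step (2).)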
+*
+         MB2 = MB - N
+         M_PLUS_ONE = M + 1
+         ITMP = ( M - MB - 1 ) / MB2
+         IB_BOTTOM = ITMP * MB2 + MB + 1
+         NUM_ALL_ROW_BLOCKS = ITMP + 2
+         JB_T = NUM_ALL_ROW_BLOCKS * N + 1
+*
+         DO IB = IB_BOTTOM, MB+1, -MB2
+*
+*           Determine the block size IMB for the current row block
+*           in the matrix A.
+*
+            IMB = MIN( M_PLUS_ONE - IB, MB2 )
+*
+*           Determine the column index JB_T for the current column block
+*           in the matrix T.
+*
+            JB_T = JB_T - N
+*
+*           Apply column blocks of H in the row block from right to left.
+*
+*           KB is the column index of the current column block reflector
+*           in the matrices T and V.
+*
+            DO KB = KB_LAST, 1, -NBLOCAL
+*
+*              Determine the size of the current column block KNB in
+*              the matrices T and V.
+*
+               KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+               CALL CLARFB_GETT( 'I', IMB, N-KB+1, KNB,
+     $                      T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA,
+     $                      A( IB, KB ), LDA, WORK, KNB )
+*
+            END DO
+*
+         END DO
+*
+      END IF
+*
+*     (2) Top row block of A.
+*     NOTE: If MB>=M, then we have only one row block of A of size M
+*     and we work on the entire matrix A.
+*
+      MB1 = MIN( MB, M )
+*
+*     Apply column blocks of H in the top row block from right to left.
+*
+*     KB is the column index of the current block reflector in
+*     the matrices T and V.
+*
+      DO KB = KB_LAST, 1, -NBLOCAL
+*
+*        Determine the size of the current column block KNB in
+*        the matrices T and V.
+*
+         KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+         IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+*           In CLARFB_GETT parameters, when M=0, then the matrix B
+*           does not exist, hence we need to pass a dummy array
+*           reference DUMMY(1,1) to B with LDDUMMY=1.
+*
+            CALL CLARFB_GETT( 'N', 0, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT,  A( KB, KB ), LDA,
+     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
+         ELSE
+            CALL CLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        A( KB+KNB, KB), LDA, WORK, KNB )
+
+         END IF
+*
+      END DO
+*
+      WORK( 1 ) = CMPLX( LWORKOPT )
+      RETURN
+*
+*     End of CUNGTSQR_ROW
+*
+      END
diff --git a/lapack-netlib/SRC/dgesdd.f b/lapack-netlib/SRC/dgesdd.f
index 0218900d2..80d18041c 100644
--- a/lapack-netlib/SRC/dgesdd.f
+++ b/lapack-netlib/SRC/dgesdd.f
@@ -267,9 +267,9 @@
      $                   XERBLA
 *     ..
 *     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, DISNAN
       DOUBLE PRECISION   DLAMCH, DLANGE
-      EXTERNAL           DLAMCH, DLANGE, LSAME
+      EXTERNAL           DLAMCH, DLANGE, LSAME, DISNAN
 *     ..
 *     .. Intrinsic Functions ..
       INTRINSIC          INT, MAX, MIN, SQRT
@@ -599,6 +599,10 @@
 *     Scale A if max element outside range [SMLNUM,BIGNUM]
 *
       ANRM = DLANGE( 'M', M, N, A, LDA, DUM )
+      IF( DISNAN( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
       ISCL = 0
       IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
         ISCL = 1
diff --git a/lapack-netlib/SRC/dgetsqrhrt.f b/lapack-netlib/SRC/dgetsqrhrt.f
new file mode 100644
index 000000000..668deeba8
--- /dev/null
+++ b/lapack-netlib/SRC/dgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b DGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DGETSQRHRT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*       SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*      $                       LWORK, INFO )
+*       IMPLICIT NONE
+*
+*       .. Scalar Arguments ..
+*       INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*       ..
+*       .. Array Arguments ..
+*       DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*       ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DGETSQRHRT computes an NB2-sized column blocked QR-factorization
+*> of a real M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally an NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
+*>
+*> The output Q and R factors are stored in the same format as in DGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of DGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      INTEGER            INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE
+      PARAMETER          ( ONE = 1.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LQUERY
+      INTEGER            I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT,
+     $                   NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DLATSQR, DORGTSQR_ROW, DORHR_COL,
+     $                   XERBLA
+*     ..
+*     .. Intrinsic Functions ..
+      INTRINSIC          CEILING, DBLE, MAX, MIN
+*     ..
+*     .. Executable Statements ..
+*
+*     Test the input arguments
+*
+      INFO = 0
+      LQUERY = LWORK.EQ.-1
+      IF( M.LT.0 ) THEN
+         INFO = -1
+      ELSE IF( N.LT.0 .OR. M.LT.N ) THEN
+         INFO = -2
+      ELSE IF( MB1.LE.N ) THEN
+         INFO = -3
+      ELSE IF( NB1.LT.1 ) THEN
+         INFO = -4
+      ELSE IF( NB2.LT.1 ) THEN
+         INFO = -5
+      ELSE IF( LDA.LT.MAX( 1, M ) ) THEN
+         INFO = -7
+      ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN
+         INFO = -9
+      ELSE
+*
+*        Test the input LWORK for the dimension of the array WORK.
+*        This workspace is used to store arrays:
+*        a) Matrix T and WORK for DLATSQR;
+*        b) N-by-N upper-triangular factor R_tsqr;
+*        c) Matrix T and array WORK for DORGTSQR_ROW;
+*        d) Diagonal D for DORHR_COL.
+*
+         IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN
+            INFO = -11
+         ELSE
+*
+*           Set block size for column blocks
+*
+            NB1LOCAL = MIN( NB1, N )
+*
+            NUM_ALL_ROW_BLOCKS = MAX( 1,
+     $                   CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) )
+*
+*           Length and leading dimension of WORK array to place
+*           T array in TSQR.
+*
+            LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL
+
+            LDWT = NB1LOCAL
+*
+*           Length of TSQR work array
+*
+            LW1 = NB1LOCAL * N
+*
+*           Length of DORGTSQR_ROW work array.
+*
+            LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) )
+*
+            LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) )
+*
+            IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN
+               INFO = -11
+            END IF
+*
+         END IF
+      END IF
+*
+*     Handle error in the input parameters and handle the workspace query.
+*
+      IF( INFO.NE.0 ) THEN
+         CALL XERBLA( 'DGETSQRHRT', -INFO )
+         RETURN
+      ELSE IF ( LQUERY ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+*     Quick return if possible
+*
+      IF( MIN( M, N ).EQ.0 ) THEN
+         WORK( 1 ) = DBLE( LWORKOPT )
+         RETURN
+      END IF
+*
+      NB2LOCAL = MIN( NB2, N )
+*
+*
+*     (1) Perform TSQR-factorization of the M-by-N matrix A.
+*
+      CALL DLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT,
+     $              WORK(LWT+1), LW1, IINFO )
+*
+*     (2) Copy the factor R_tsqr stored in the upper-triangular part
+*         of A into the square matrix in the work array
+*         WORK(LWT+1:LWT+N*N) column-by-column.
+*
+      DO J = 1, N
+         CALL DCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 )
+      END DO
+*
+*     (3) Generate an M-by-N matrix Q with orthonormal columns from
+*     the result stored below the diagonal in the array A in place.
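+*
+*     (Workspace layout at this point: WORK(1:LWT) holds the TSQR
+*     block reflectors T, WORK(LWT+1:LWT+N*N) holds the copy of
+*     R_tsqr made in step (2), and WORK(LWT+N*N+1:) is the scratch
+*     area passed to DORGTSQR_ROW below.)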
+* + + CALL DORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL DORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. +* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the DORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL DCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN +* +* End of DGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/dggglm.f b/lapack-netlib/SRC/dggglm.f index 2e92912e0..1fbdc8add 100644 --- a/lapack-netlib/SRC/dggglm.f +++ b/lapack-netlib/SRC/dggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/dhseqr.f b/lapack-netlib/SRC/dhseqr.f index b4fc3af90..6b7fb308f 100644 --- a/lapack-netlib/SRC/dhseqr.f +++ b/lapack-netlib/SRC/dhseqr.f @@ -338,10 +338,10 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare DLAHQR failure. NL > NTINY = 11 is +* . through a rare DLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/dlanv2.f b/lapack-netlib/SRC/dlanv2.f index 61b016f16..1c277c6bb 100644 --- a/lapack-netlib/SRC/dlanv2.f +++ b/lapack-netlib/SRC/dlanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - DOUBLE PRECISION ZERO, HALF, ONE + DOUBLE PRECISION ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0D+0, HALF = 0.5D+0, ONE = 1.0D+0, $ TWO = 2.0D0 ) DOUBLE PRECISION MULTPL diff --git a/lapack-netlib/SRC/dlaqr0.f b/lapack-netlib/SRC/dlaqr0.f index f362c096c..8334d8d2b 100644 --- a/lapack-netlib/SRC/dlaqr0.f +++ b/lapack-netlib/SRC/dlaqr0.f @@ -278,7 +278,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -362,22 +362,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . 
(In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -425,7 +425,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -576,7 +576,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAQR4 or * . DLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -698,7 +698,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr4.f b/lapack-netlib/SRC/dlaqr4.f index 454bf9608..163e55deb 100644 --- a/lapack-netlib/SRC/dlaqr4.f +++ b/lapack-netlib/SRC/dlaqr4.f @@ -284,7 +284,7 @@ * . DLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -368,22 +368,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'DLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -431,7 +431,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -582,7 +582,7 @@ * * ==== Got NS/2 or fewer shifts? Use DLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . 
(NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/dlaqr5.f b/lapack-netlib/SRC/dlaqr5.f index f58db9c89..12e7db637 100644 --- a/lapack-netlib/SRC/dlaqr5.f +++ b/lapack-netlib/SRC/dlaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: DLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: DLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is DOUBLE PRECISION array, dimension (LDU,3*NSHFTS-3) +*> U is DOUBLE PRECISION array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is DOUBLE PRECISION array, dimension (LDWV,3*NSHFTS-3) +*> WV is DOUBLE PRECISION array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup doubleOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE DLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ DOUBLE PRECISION ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. 
( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL DLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . 
criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL DLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL DLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL DLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL DLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL DLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 -* -* ==== Horizontal Multiply ==== -* - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL DGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, $ LDWH ) - CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, + CALL DLACPY( 'ALL', NU, JLEN, WH, LDWH, $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + 150 CONTINUE * -* ==== Vertical multiply ==== +* ==== Vertical multiply ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE +* +* ==== Z multiply (also vertical) ==== +* + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL DGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL DLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting 
U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL DLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL DTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL DGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL DLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL DTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL DLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL DLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ====
-*
-                     CALL DLACPY( 'ALL', JLEN, KNZ,
-     $                            Z( JROW, INCOL+1+J2 ), LDZ,
-     $                            WV( 1, 1+KZS ), LDWV )
-*
-*                    ==== Multiply by U12 ====
-*
-                     CALL DLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV,
-     $                            LDWV )
-                     CALL DTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE,
-     $                           U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ),
-     $                           LDWV )
-*
-*                    ==== Multiply by U11 ====
-*
-                     CALL DGEMM( 'N', 'N', JLEN, I2, J2, ONE,
-     $                           Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE,
-     $                           WV, LDWV )
-*
-*                    ==== Copy left of Z to right of scratch ====
-*
-                     CALL DLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ),
-     $                            LDZ, WV( 1, 1+I2 ), LDWV )
-*
-*                    ==== Multiply by U21 ====
-*
-                     CALL DTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE,
-     $                           U( 1, I2+1 ), LDU, WV( 1, 1+I2 ),
-     $                           LDWV )
-*
-*                    ==== Multiply by U22 ====
-*
-                     CALL DGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE,
-     $                           Z( JROW, INCOL+1+J2 ), LDZ,
-     $                           U( J2+1, I2+1 ), LDU, ONE,
-     $                           WV( 1, 1+I2 ), LDWV )
-*
-*                    ==== Copy the result back to Z ====
-*
-                     CALL DLACPY( 'ALL', JLEN, KDU, WV, LDWV,
-     $                            Z( JROW, INCOL+1 ), LDZ )
-  210             CONTINUE
-               END IF
            END IF
         END IF
-  220 CONTINUE
+  180 CONTINUE
*
*     ==== End of DLAQR5 ====
*
diff --git a/lapack-netlib/SRC/dlarfb_gett.f b/lapack-netlib/SRC/dlarfb_gett.f
new file mode 100644
index 000000000..10ab6461e
--- /dev/null
+++ b/lapack-netlib/SRC/dlarfb_gett.f
@@ -0,0 +1,596 @@
+*> \brief \b DLARFB_GETT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DLARFB_GETT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+*     $                        WORK, LDWORK )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      CHARACTER          IDENT
+*      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+*      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
+*     $                   WORK( LDWORK, * )
+*     ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DLARFB_GETT applies a real Householder block reflector H from the
+*> left to a real (K+M)-by-N  "triangular-pentagonal" matrix
+*> composed of two block matrices: an upper trapezoidal K-by-N matrix A
+*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored
+*> in the array B. The block reflector H is stored in a compact
+*> WY-representation, where the elementary reflectors are in the
+*> arrays A, B and T. See Further Details section.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] IDENT
+*> \verbatim
+*>          IDENT is CHARACTER*1
+*>          If IDENT is not 'I' and not 'i', then V1 is unit
+*>             lower-triangular and stored in the left K-by-K block of
+*>             the input matrix A,
+*>          If IDENT = 'I' or 'i', then  V1 is an identity matrix and
+*>             not stored.
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix B.
+*>          M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrices A and B.
+*>          N >= 0.
+*> \endverbatim
+*>
+*> \param[in] K
+*> \verbatim
+*>          K is INTEGER
+*>          The number of rows of the matrix A.
+*>          K is also the order of the matrix T, i.e. the number of
+*>          elementary reflectors whose product defines the block
+*>          reflector. 0 <= K <= N.
+*> \endverbatim
+*>
+*> \param[in] T
+*> \verbatim
+*>          T is DOUBLE PRECISION array, dimension (LDT,K)
+*>          The upper-triangular K-by-K matrix T in the representation
+*>          of the block reflector.
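+*          As an illustration (a standard property of the compact WY
+*          form, not an extra interface requirement): for K = 2
+*          elementary reflectors H(i) = I - TAU(i)*v(i)*v(i)**T,
+*
+*             T = ( TAU(1)   -TAU(1)*TAU(2)*(v(1)**T)*v(2) )
+*                 (   0                  TAU(2)            ),
+*
+*          so that H(1)*H(2) = I - V*T*V**T with V = ( v(1), v(2) ).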
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T. LDT >= K.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is DOUBLE PRECISION array, dimension (LDA,N)
+*>
+*>          On entry:
+*>           a) In the K-by-N upper-trapezoidal part A: input matrix A.
+*>           b) In the columns below the diagonal: columns of V1
+*>              (ones are not stored on the diagonal).
+*>
+*>          On exit:
+*>            A is overwritten by rectangular K-by-N product H*A.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A. LDA >= max(1,K).
+*> \endverbatim
+*>
+*> \param[in,out] B
+*> \verbatim
+*>          B is DOUBLE PRECISION array, dimension (LDB,N)
+*>
+*>          On entry:
+*>            a) In the M-by-(N-K) right block: input matrix B.
+*>            b) In the M-by-K left block: columns of V2.
+*>
+*>          On exit:
+*>            B is overwritten by rectangular M-by-N product H*B.
+*>
+*>          See Further Details section.
+*> \endverbatim
+*>
+*> \param[in] LDB
+*> \verbatim
+*>          LDB is INTEGER
+*>          The leading dimension of the array B. LDB >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          WORK is DOUBLE PRECISION array,
+*>          dimension (LDWORK,max(K,N-K))
+*> \endverbatim
+*>
+*> \param[in] LDWORK
+*> \verbatim
+*>          LDWORK is INTEGER
+*>          The leading dimension of the array WORK. LDWORK>=max(1,K).
+*>
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup doubleOTHERauxiliary
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*
+*> \par Further Details:
+*  =====================
+*>
+*> \verbatim
+*>
+*> (1) Description of the Algebraic Operation.
+*>
+*>     The matrix A is a K-by-N matrix composed of two column block
+*>     matrices, A1, which is K-by-K, and A2, which is K-by-(N-K):
+*>     A = ( A1, A2 ).
+*>     The matrix B is an M-by-N matrix composed of two column block
+*>     matrices, B1, which is M-by-K, and B2, which is M-by-(N-K):
+*>     B = ( B1, B2 ).
+*>
+*>     Perform the operation:
+*>
+*>        ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) =
+*>        ( B_out )        ( B_in )                          ( B_in )
+*>         = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in )
+*>               ( V2 )                              ( B_in )
+*>     On input:
+*>
+*>      a) ( A_in ) consists of two block columns:
+*>         ( B_in )
+*>
+*>         ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in ))
+*>         ( B_in )   (( B1_in ) ( B2_in ))   ((     0 ) ( B2_in )),
+*>
+*>         where the column blocks are:
+*>
+*>         (  A1_in )  is a K-by-K upper-triangular matrix stored in the
+*>                     upper triangular part of the array A(1:K,1:K).
+*>         (  B1_in )  is an M-by-K rectangular ZERO matrix and not stored.
+*>
+*>         ( A2_in )  is a K-by-(N-K) rectangular matrix stored
+*>                    in the array A(1:K,K+1:N).
+*>         ( B2_in )  is an M-by-(N-K) rectangular matrix stored
+*>                    in the array B(1:M,K+1:N).
+*>
+*>      b) V = ( V1 )
+*>             ( V2 )
+*>
+*>         where:
+*>         1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored;
+*>         2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix,
+*>            stored in the lower-triangular part of the array
+*>            A(1:K,1:K) (ones are not stored),
+*>         and V2 is an M-by-K rectangular matrix stored in the array
+*>         B(1:M,1:K), (because on input B1_in is a rectangular zero
+*>                      matrix that is not stored and the space is
+*>                      used to store V2).
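+*>
+*>         As a small illustration, for K = 2, N = 4, M = 3 the input
+*>         storage (entry names here are placeholders) is
+*>
+*>            A = ( a11 a12 a13 a14 )    B = ( v31 v32 b13 b14 )
+*>                ( v21 a22 a23 a24 )        ( v41 v42 b23 b24 )
+*>                                           ( v51 v52 b33 b34 )
+*>
+*>         i.e. A1_in is the upper triangle of A(1:2,1:2), the entry
+*>         v21 below it holds V1 (the unit diagonal is not stored),
+*>         A2_in is A(1:2,3:4), V2 is B(1:3,1:2) and B2_in is
+*>         B(1:3,3:4).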
+*>
+*>      c) T is a K-by-K upper-triangular matrix stored
+*>         in the array T(1:K,1:K).
+*>
+*>     On output:
+*>
+*>      a) ( A_out ) consists of two block columns:
+*>         ( B_out )
+*>
+*>         ( A_out ) = (( A1_out ) ( A2_out ))
+*>         ( B_out )   (( B1_out ) ( B2_out )),
+*>
+*>         where the column blocks are:
+*>
+*>         ( A1_out )  is a K-by-K square matrix, or a K-by-K
+*>                     upper-triangular matrix, if V1 is an
+*>                     identity matrix. A1_out is stored in
+*>                     the array A(1:K,1:K).
+*>         ( B1_out )  is an M-by-K rectangular matrix stored
+*>                     in the array B(1:M,1:K).
+*>
+*>         ( A2_out )  is a K-by-(N-K) rectangular matrix stored
+*>                     in the array A(1:K,K+1:N).
+*>         ( B2_out )  is an M-by-(N-K) rectangular matrix stored
+*>                     in the array B(1:M,K+1:N).
+*>
+*>
+*>     The operation above can be represented as the same operation
+*>     on each block column:
+*>
+*>        ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in )
+*>        ( B1_out )        (     0 )                          (     0 )
+*>
+*>        ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in )
+*>        ( B2_out )        ( B2_in )                          ( B2_in )
+*>
+*>     If IDENT != 'I':
+*>
+*>        The computation for column block 1:
+*>
+*>        A1_out: = A1_in - V1*T*(V1**T)*A1_in
+*>
+*>        B1_out: = - V2*T*(V1**T)*A1_in
+*>
+*>        The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in )
+*>
+*>     If IDENT == 'I':
+*>
+*>        The operation for column block 1:
+*>
+*>        A1_out: = A1_in - T * A1_in
+*>
+*>        B1_out: = - V2 * T * A1_in
+*>
+*>        The computation for column block 2, which exists if N > K:
+*>
+*>        A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in )
+*>
+*>        B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in )
+*>
+*> (2) Description of the Algorithmic Computation.
+*>
+*>     In the first step, we compute column block 2, i.e. A2 and B2.
+*>     Here, we need to use the K-by-(N-K) rectangular workspace
+*>     matrix W2 that is of the same size as the matrix A2.
+*>     W2 is stored in the array WORK(1:K,1:(N-K)).
+*>
+*>     In the second step, we compute column block 1, i.e. A1 and B1.
+*>     Here, we need to use the K-by-K square workspace matrix W1
+*>     that is of the same size as the matrix A1.
+*>     W1 is stored in the array WORK(1:K,1:K).
+*>
+*>     NOTE: Hence, in this routine, we need the workspace array WORK
+*>     only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from
+*>     the first step and W1 from the second step.
+*>
+*>     Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I',
+*>     requires more computations than Case (B).
+*>
+*>     if( IDENT != 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2
+*>        col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>        col1_(6) square A1: = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Case (B), when V1 is an identity matrix, i.e.
IDENT == 'I',
+*>     requires fewer computations than Case (A)
+*>
+*>     if( IDENT == 'I' ) then
+*>      if ( N > K ) then
+*>        (First Step - column block 2)
+*>        col2_(1) W2: = A2
+*>        col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>        col2_(4) W2: = T * W2
+*>        col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>        col2_(7) A2: = A2 - W2
+*>      else
+*>        (Second Step - column block 1)
+*>        col1_(1) W1: = A1
+*>        col1_(3) W1: = T * W1
+*>        col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>        col1_(6) upper-triangular_of_(A1): = A1 - W1
+*>      end if
+*>     end if
+*>
+*>     Combining cases (A) and (B), this is the resulting
+*>     algorithm:
+*>
+*>     if ( N > K ) then
+*>
+*>      (First Step - column block 2)
+*>
+*>      col2_(1) W2: = A2
+*>      if( IDENT != 'I' ) then
+*>        col2_(2) W2: = (V1**T) * W2
+*>                     = (unit_lower_tr_of_(A1)**T) * W2
+*>      end if
+*>      col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*>      col2_(4) W2: = T * W2
+*>      col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2
+*>      if( IDENT != 'I' ) then
+*>        col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2
+*>      end if
+*>      col2_(7) A2: = A2 - W2
+*>
+*>     else
+*>
+*>      (Second Step - column block 1)
+*>
+*>      col1_(1) W1: = A1
+*>      if( IDENT != 'I' ) then
+*>        col1_(2) W1: = (V1**T) * W1
+*>                     = (unit_lower_tr_of_(A1)**T) * W1
+*>      end if
+*>      col1_(3) W1: = T * W1
+*>      col1_(4) B1: = - V2 * W1 = - B1 * W1
+*>      if( IDENT != 'I' ) then
+*>        col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1
+*>        col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1)
+*>      end if
+*>      col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1)
+*>
+*>     end if
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE DLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB,
+     $                        WORK, LDWORK )
+      IMPLICIT NONE
+*
+*  -- LAPACK auxiliary routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+      CHARACTER          IDENT
+      INTEGER            K, LDA, LDB, LDT, LDWORK, M, N
+*     ..
+*     .. Array Arguments ..
+      DOUBLE PRECISION   A( LDA, * ), B( LDB, * ), T( LDT, * ),
+     $                   WORK( LDWORK, * )
+*     ..
+*
+*  =====================================================================
+*
+*     .. Parameters ..
+      DOUBLE PRECISION   ONE, ZERO
+      PARAMETER          ( ONE = 1.0D+0, ZERO = 0.0D+0 )
+*     ..
+*     .. Local Scalars ..
+      LOGICAL            LNOTIDENT
+      INTEGER            I, J
+*     ..
+*     .. EXTERNAL FUNCTIONS ..
+      LOGICAL            LSAME
+      EXTERNAL           LSAME
+*     ..
+*     .. External Subroutines ..
+      EXTERNAL           DCOPY, DGEMM, DTRMM
+*     ..
+*     .. Executable Statements ..
+*
+*     Quick return if possible
+*
+      IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N )
+     $   RETURN
+*
+      LNOTIDENT = .NOT.LSAME( IDENT, 'I' )
+*
+*     ------------------------------------------------------------------
+*
+*     First Step. Computation of the Column Block 2:
+*
+*        ( A2 ) := H * ( A2 )
+*        ( B2 )        ( B2 )
+*
+*     ------------------------------------------------------------------
+*
+      IF( N.GT.K ) THEN
+*
+*        col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N)
+*        into W2=WORK(1:K, 1:N-K) column-by-column.
+*
+         DO J = 1, N-K
+            CALL DCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 )
+         END DO
+
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2,
+*           V1 is not an identity matrix, but unit lower-triangular
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+*
+            CALL DTRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2
+*        V2 stored in B1.
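+*        (When M = 0, V2 and B2 are empty and this product
+*        is skipped.)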
+*
+         IF( M.GT.0 ) THEN
+            CALL DGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB,
+     $                  B( 1, K+1 ), LDB, ONE, WORK, LDWORK )
+         END IF
+*
+*        col2_(4) Compute W2: = T * W2,
+*        T is upper-triangular.
+*
+         CALL DTRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT,
+     $               WORK, LDWORK )
+*
+*        col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2,
+*        V2 stored in B1.
+*
+         IF( M.GT.0 ) THEN
+            CALL DGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB,
+     $                  WORK, LDWORK, ONE, B( 1, K+1 ), LDB )
+         END IF
+*
+         IF( LNOTIDENT ) THEN
+*
+*           col2_(6) Compute W2: = V1 * W2 = A1 * W2,
+*           V1 is not an identity matrix, but unit lower-triangular,
+*           V1 stored in A1 (diagonal ones are not stored).
+*
+            CALL DTRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA,
+     $                  WORK, LDWORK )
+         END IF
+*
+*        col2_(7) Compute A2: = A2 - W2 =
+*                             = A(1:K, K+1:N) - WORK(1:K, 1:N-K),
+*        column-by-column.
+*
+         DO J = 1, N-K
+            DO I = 1, K
+               A( I, K+J ) = A( I, K+J ) - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     ------------------------------------------------------------------
+*
+*     Second Step. Computation of the Column Block 1:
+*
+*        ( A1 ) := H * ( A1 )
+*        ( B1 )        (  0 )
+*
+*     ------------------------------------------------------------------
+*
+*     col1_(1) Compute W1: = A1. Copy the upper-triangular
+*     A1 = A(1:K, 1:K) into the upper-triangular
+*     W1 = WORK(1:K, 1:K) column-by-column.
+*
+      DO J = 1, K
+         CALL DCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 )
+      END DO
+*
+*     Set the subdiagonal elements of W1 to zero column-by-column.
+*
+      DO J = 1, K - 1
+         DO I = J + 1, K
+            WORK( I, J ) = ZERO
+         END DO
+      END DO
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular with zeroes below the diagonal.
+*
+         CALL DTRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA,
+     $               WORK, LDWORK )
+      END IF
+*
+*     col1_(3) Compute W1: = T * W1,
+*     T is upper-triangular,
+*     W1 is upper-triangular with zeroes below the diagonal.
+*
+      CALL DTRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT,
+     $            WORK, LDWORK )
+*
+*     col1_(4) Compute B1: = - V2 * W1 = - B1 * W1,
+*     V2 = B1, W1 is upper-triangular with zeroes below the diagonal.
+*
+      IF( M.GT.0 ) THEN
+         CALL DTRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK,
+     $               B, LDB )
+      END IF
+*
+      IF( LNOTIDENT ) THEN
+*
+*        col1_(5) Compute W1: = V1 * W1 = A1 * W1,
+*        V1 is not an identity matrix, but unit lower-triangular
+*        V1 stored in A1 (diagonal ones are not stored),
+*        W1 is upper-triangular on input with zeroes below the diagonal,
+*        and square on output.
+*
+         CALL DTRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA,
+     $               WORK, LDWORK )
+*
+*        col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K)
+*        column-by-column. A1 is upper-triangular on input.
+*        If NOT IDENT, A1 is square on output, and W1 is square,
+*        if IDENT, A1 is upper-triangular on output,
+*        W1 is upper-triangular.
+*
+*        col1_(6)_a Compute elements of A1 below the diagonal.
+*
+         DO J = 1, K - 1
+            DO I = J + 1, K
+               A( I, J ) = - WORK( I, J )
+            END DO
+         END DO
+*
+      END IF
+*
+*     col1_(6)_b Compute elements of A1 on and above the diagonal.
+*
+      DO J = 1, K
+         DO I = 1, J
+            A( I, J ) = A( I, J ) - WORK( I, J )
+         END DO
+      END DO
+*
+      RETURN
+*
+*     End of DLARFB_GETT
+*
+      END
diff --git a/lapack-netlib/SRC/dlasq2.f b/lapack-netlib/SRC/dlasq2.f
index 68d922870..27eb1f79a 100644
--- a/lapack-netlib/SRC/dlasq2.f
+++ b/lapack-netlib/SRC/dlasq2.f
@@ -184,10 +184,18 @@
*
*        2-by-2 case.
*
-         IF( Z( 2 ).LT.ZERO .OR.
Z( 3 ).LT.ZERO ) THEN
-            INFO = -2
+         IF( Z( 1 ).LT.ZERO ) THEN
+            INFO = -201
            CALL XERBLA( 'DLASQ2', 2 )
            RETURN
+         ELSE IF( Z( 2 ).LT.ZERO ) THEN
+            INFO = -202
+            CALL XERBLA( 'DLASQ2', 2 )
+            RETURN
+         ELSE IF( Z( 3 ).LT.ZERO ) THEN
+            INFO = -203
+            CALL XERBLA( 'DLASQ2', 2 )
+            RETURN
         ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN
            D = Z( 3 )
            Z( 3 ) = Z( 1 )
diff --git a/lapack-netlib/SRC/dorgbr.f b/lapack-netlib/SRC/dorgbr.f
index cfebda5ab..6868fc38d 100644
--- a/lapack-netlib/SRC/dorgbr.f
+++ b/lapack-netlib/SRC/dorgbr.f
@@ -221,8 +221,8 @@
            CALL DORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
         ELSE
            IF( M.GT.1 ) THEN
-               CALL DORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL DORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
            END IF
         END IF
      ELSE
@@ -230,8 +230,8 @@
            CALL DORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO )
         ELSE
            IF( N.GT.1 ) THEN
-               CALL DORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK,
-     $                      -1, IINFO )
+               CALL DORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1,
+     $                      IINFO )
            END IF
         END IF
      END IF
diff --git a/lapack-netlib/SRC/dorgtsqr_row.f b/lapack-netlib/SRC/dorgtsqr_row.f
new file mode 100644
index 000000000..94f8b0120
--- /dev/null
+++ b/lapack-netlib/SRC/dorgtsqr_row.f
@@ -0,0 +1,379 @@
+*> \brief \b DORGTSQR_ROW
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download DORGTSQR_ROW + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK,
+*     $                         LWORK, INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER           INFO, LDA, LDT, LWORK, M, N, MB, NB
+*     ..
+*     .. Array Arguments ..
+*      DOUBLE PRECISION  A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> DORGTSQR_ROW generates an M-by-N real matrix Q_out with
+*> orthonormal columns from the output of DLATSQR. These N orthonormal
+*> columns are the first N columns of a product of real orthogonal
+*> matrices Q(k)_in of order M, which are returned by DLATSQR in
+*> a special format.
+*>
+*>      Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ).
+*>
+*> The input matrices Q(k)_in are stored in row and column blocks in A.
+*> See the documentation of DLATSQR for more details on the format of
+*> Q(k)_in, where each Q(k)_in is represented by block Householder
+*> transformations. This routine calls an auxiliary routine DLARFB_GETT,
+*> where the computation is performed on each individual block. The
+*> algorithm first sweeps NB-sized column blocks from the right to left
+*> starting in the bottom row block and continues to the top row block
+*> (hence _ROW in the routine name). This sweep is in the reverse of
+*> the order in which DLATSQR generates its output blocks.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB
+*> \verbatim
+*>          MB is INTEGER
+*>          The row block size used by DLATSQR to return
+*>          arrays A and T. MB > N.
+*>          (Note that if MB > M, then M is used instead of MB
+*>          as the row block size).
+*> \endverbatim
+*>
+*> \param[in] NB
+*> \verbatim
+*>          NB is INTEGER
+*>          The column block size used by DLATSQR to return
+*>          arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is DOUBLE PRECISION array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by DLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See DLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is DOUBLE PRECISION array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See DLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) DOUBLE PRECISION array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup doubleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE DORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + DOUBLE PRECISION A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ONE = 1.0D+0, ZERO = 0.0D+0 ) +* .. +* .. Local Scalars .. 
+ LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + DOUBLE PRECISION DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL DLARFB_GETT, DLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DBLE, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'DORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DBLE( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL DLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL DLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. 
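+*     (Illustration with assumed sizes: for N = 4 and NBLOCAL = 2,
+*     KB_LAST = ( (4-1)/2 )*2 + 1 = 3, so the loop below visits
+*     the column blocks starting at KB = 3 and KB = 1, each of
+*     width KNB = 2.)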
+*
+      DO KB = KB_LAST, 1, -NBLOCAL
+*
+*        Determine the size of the current column block KNB in
+*        the matrices T and V.
+*
+         KNB = MIN( NBLOCAL, N - KB + 1 )
+*
+         IF( MB1-KB-KNB+1.EQ.0 ) THEN
+*
+*           In DLARFB_GETT parameters, when M=0, the matrix B
+*           does not exist, hence we need to pass a dummy array
+*           reference DUMMY(1,1) to B with LDDUMMY=1.
+*
+            CALL DLARFB_GETT( 'N', 0, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        DUMMY( 1, 1 ), 1, WORK, KNB )
+         ELSE
+            CALL DLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB,
+     $                        T( 1, KB ), LDT, A( KB, KB ), LDA,
+     $                        A( KB+KNB, KB), LDA, WORK, KNB )
+
+         END IF
+*
+      END DO
+*
+      WORK( 1 ) = DBLE( LWORKOPT )
+      RETURN
+*
+*     End of DORGTSQR_ROW
+*
+      END
diff --git a/lapack-netlib/SRC/dtgsja.f b/lapack-netlib/SRC/dtgsja.f
index 66f32b790..537bd3f4f 100644
--- a/lapack-netlib/SRC/dtgsja.f
+++ b/lapack-netlib/SRC/dtgsja.f
@@ -400,7 +400,7 @@
*     .. Parameters ..
      INTEGER            MAXIT
      PARAMETER          ( MAXIT = 40 )
-      DOUBLE PRECISION   ZERO, ONE
+      DOUBLE PRECISION   ZERO, ONE, HUGENUM
      PARAMETER          ( ZERO = 0.0D+0, ONE = 1.0D+0 )
*     ..
*     .. Local Scalars ..
@@ -419,7 +419,8 @@
     $                   DSCAL, XERBLA
*     ..
*     .. Intrinsic Functions ..
-      INTRINSIC          ABS, MAX, MIN
+      INTRINSIC          ABS, MAX, MIN, HUGE
+      PARAMETER          ( HUGENUM = HUGE(ZERO) )
*     ..
*     .. Executable Statements ..
*
@@ -596,9 +597,9 @@
*
            A1 = A( K+I, N-L+I )
            B1 = B( I, N-L+I )
+            GAMMA = B1 / A1
*
-            IF( A1.NE.ZERO ) THEN
-               GAMMA = B1 / A1
+            IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN
*
*              change sign if necessary
*
diff --git a/lapack-netlib/SRC/sgesdd.f b/lapack-netlib/SRC/sgesdd.f
index 689494dd1..89e03a002 100644
--- a/lapack-netlib/SRC/sgesdd.f
+++ b/lapack-netlib/SRC/sgesdd.f
@@ -267,9 +267,9 @@
     $                   XERBLA
*     ..
*     .. External Functions ..
-      LOGICAL            LSAME
+      LOGICAL            LSAME, SISNAN
      REAL               SLAMCH, SLANGE
-      EXTERNAL           SLAMCH, SLANGE, LSAME
+      EXTERNAL           SLAMCH, SLANGE, LSAME, SISNAN
*     ..
*     .. Intrinsic Functions ..
      INTRINSIC          INT, MAX, MIN, SQRT
@@ -599,6 +599,10 @@
*     Scale A if max element outside range [SMLNUM,BIGNUM]
*
      ANRM = SLANGE( 'M', M, N, A, LDA, DUM )
+      IF( SISNAN( ANRM ) ) THEN
+         INFO = -4
+         RETURN
+      END IF
      ISCL = 0
      IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN
         ISCL = 1
diff --git a/lapack-netlib/SRC/sgetsqrhrt.f b/lapack-netlib/SRC/sgetsqrhrt.f
new file mode 100644
index 000000000..f9580da7b
--- /dev/null
+++ b/lapack-netlib/SRC/sgetsqrhrt.f
@@ -0,0 +1,349 @@
+*> \brief \b SGETSQRHRT
+*
+*  =========== DOCUMENTATION ===========
+*
+* Online html documentation available at
+*            http://www.netlib.org/lapack/explore-html/
+*
+*> \htmlonly
+*> Download SGETSQRHRT + dependencies
+*>
+*> [TGZ]
+*>
+*> [ZIP]
+*>
+*> [TXT]
+*> \endhtmlonly
+*
+*  Definition:
+*  ===========
+*
+*      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+*     $                       LWORK, INFO )
+*      IMPLICIT NONE
+*
+*     .. Scalar Arguments ..
+*      INTEGER           INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1
+*     ..
+*     .. Array Arguments ..
+*      REAL              A( LDA, * ), T( LDT, * ), WORK( * )
+*     ..
+*
+*
+*> \par Purpose:
+*  =============
+*>
+*> \verbatim
+*>
+*> SGETSQRHRT computes an NB2-sized column blocked QR-factorization
+*> of a real M-by-N matrix A with M >= N,
+*>
+*>    A = Q * R.
+*>
+*> The routine uses internally an NB1-sized column blocked and MB1-sized
+*> row blocked TSQR-factorization and performs the reconstruction
+*> of the Householder vectors from the TSQR output. The routine also
+*> converts the R_tsqr factor from the TSQR-factorization output into
+*> the R factor that corresponds to the Householder QR-factorization,
+*>
+*>    A = Q_tsqr * R_tsqr = Q * R.
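+*>
+*> A plausible call sequence (the names M, N, MB1, NB1, NB2, A, LDA,
+*> T, LDT, WORK, LWORK, INFO are the arguments described below; the
+*> two-step pattern is the standard LAPACK workspace query) is:
+*>
+*>    CALL SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
+*>   $                 WORK, -1, INFO )
+*>    LWORK = INT( WORK( 1 ) )
+*>    (allocate WORK with LWORK entries)
+*>    CALL SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT,
+*>   $                 WORK, LWORK, INFO )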
+*>
+*> The output Q and R factors are stored in the same format as in SGEQRT
+*> (Q is in blocked compact WY-representation). See the documentation
+*> of SGEQRT for more details on the format.
+*> \endverbatim
+*
+*  Arguments:
+*  ==========
+*
+*> \param[in] M
+*> \verbatim
+*>          M is INTEGER
+*>          The number of rows of the matrix A.  M >= 0.
+*> \endverbatim
+*>
+*> \param[in] N
+*> \verbatim
+*>          N is INTEGER
+*>          The number of columns of the matrix A. M >= N >= 0.
+*> \endverbatim
+*>
+*> \param[in] MB1
+*> \verbatim
+*>          MB1 is INTEGER
+*>          The row block size to be used in the blocked TSQR.
+*>          MB1 > N.
+*> \endverbatim
+*>
+*> \param[in] NB1
+*> \verbatim
+*>          NB1 is INTEGER
+*>          The column block size to be used in the blocked TSQR.
+*>          N >= NB1 >= 1.
+*> \endverbatim
+*>
+*> \param[in] NB2
+*> \verbatim
+*>          NB2 is INTEGER
+*>          The block size to be used in the blocked QR that is
+*>          output. NB2 >= 1.
+*> \endverbatim
+*>
+*> \param[in,out] A
+*> \verbatim
+*>          A is REAL array, dimension (LDA,N)
+*>
+*>          On entry: an M-by-N matrix A.
+*>
+*>          On exit:
+*>           a) the elements on and above the diagonal
+*>              of the array contain the N-by-N upper-triangular
+*>              matrix R corresponding to the Householder QR;
+*>           b) the elements below the diagonal represent Q by
+*>              the columns of blocked V (compact WY-representation).
+*> \endverbatim
+*>
+*> \param[in] LDA
+*> \verbatim
+*>          LDA is INTEGER
+*>          The leading dimension of the array A.  LDA >= max(1,M).
+*> \endverbatim
+*>
+*> \param[out] T
+*> \verbatim
+*>          T is REAL array, dimension (LDT,N)
+*>          The upper triangular block reflectors stored in compact form
+*>          as a sequence of upper triangular blocks.
+*> \endverbatim
+*>
+*> \param[in] LDT
+*> \verbatim
+*>          LDT is INTEGER
+*>          The leading dimension of the array T.  LDT >= NB2.
+*> \endverbatim
+*>
+*> \param[out] WORK
+*> \verbatim
+*>          (workspace) REAL array, dimension (MAX(1,LWORK))
+*>          On exit, if INFO = 0, WORK(1) returns the optimal LWORK.
+*> \endverbatim
+*>
+*> \param[in] LWORK
+*> \verbatim
+*>          LWORK is INTEGER
+*>          The dimension of the array WORK.
+*>          LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ),
+*>          where
+*>             NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)),
+*>             NB1LOCAL = MIN(NB1,N).
+*>             LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL,
+*>             LW1 = NB1LOCAL * N,
+*>             LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ),
+*>          If LWORK = -1, then a workspace query is assumed.
+*>          The routine only calculates the optimal size of the WORK
+*>          array, returns this value as the first entry of the WORK
+*>          array, and no error message related to LWORK is issued
+*>          by XERBLA.
+*> \endverbatim
+*>
+*> \param[out] INFO
+*> \verbatim
+*>          INFO is INTEGER
+*>          = 0:  successful exit
+*>          < 0:  if INFO = -i, the i-th argument had an illegal value
+*> \endverbatim
+*
+*  Authors:
+*  ========
+*
+*> \author Univ. of Tennessee
+*> \author Univ. of California Berkeley
+*> \author Univ. of Colorado Denver
+*> \author NAG Ltd.
+*
+*> \ingroup singleOTHERcomputational
+*
+*> \par Contributors:
+*  ==================
+*>
+*> \verbatim
+*>
+*> November 2020, Igor Kozachenko,
+*>                Computer Science Division,
+*>                University of California, Berkeley
+*>
+*> \endverbatim
+*>
+*  =====================================================================
+      SUBROUTINE SGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK,
+     $                       LWORK, INFO )
+      IMPLICIT NONE
+*
+*  -- LAPACK computational routine --
+*  -- LAPACK is a software package provided by Univ. of Tennessee,    --
+*  -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..--
+*
+*     .. Scalar Arguments ..
+ INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + REAL ONE + PARAMETER ( ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL SCOPY, SLATSQR, SORGTSQR_ROW, SORHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for SLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for SORGTSQR_ROW; +* d) Diagonal D for SORHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of SORGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL SLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL SCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL SORGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL SORHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. 
+* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the SORHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-ONE ) THEN + DO J = I, N + A( I, J ) = -ONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL SCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/sggglm.f b/lapack-netlib/SRC/sggglm.f index fe63da5f5..572ee511d 100644 --- a/lapack-netlib/SRC/sggglm.f +++ b/lapack-netlib/SRC/sggglm.f @@ -270,8 +270,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = ZERO + END DO + DO I = 1, P + Y(I) = ZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/shseqr.f b/lapack-netlib/SRC/shseqr.f index b5707f2c3..d22bd7b94 100644 --- a/lapack-netlib/SRC/shseqr.f +++ b/lapack-netlib/SRC/shseqr.f @@ -338,10 +338,10 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare SLAHQR failure. NL > NTINY = 11 is +* . through a rare SLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/slanv2.f b/lapack-netlib/SRC/slanv2.f index e678305f2..375645b75 100644 --- a/lapack-netlib/SRC/slanv2.f +++ b/lapack-netlib/SRC/slanv2.f @@ -139,7 +139,7 @@ * ===================================================================== * * .. Parameters .. - REAL ZERO, HALF, ONE + REAL ZERO, HALF, ONE, TWO PARAMETER ( ZERO = 0.0E+0, HALF = 0.5E+0, ONE = 1.0E+0, $ TWO = 2.0E+0 ) REAL MULTPL diff --git a/lapack-netlib/SRC/slaqr0.f b/lapack-netlib/SRC/slaqr0.f index 318b46943..b1ebaff75 100644 --- a/lapack-netlib/SRC/slaqr0.f +++ b/lapack-netlib/SRC/slaqr0.f @@ -277,7 +277,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -361,22 +361,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. 
==== * NSR = ILAENV( 15, 'SLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -424,7 +424,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -575,7 +575,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAQR4 or * . SLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -697,7 +697,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr4.f b/lapack-netlib/SRC/slaqr4.f index cd642e07f..4ba2f8757 100644 --- a/lapack-netlib/SRC/slaqr4.f +++ b/lapack-netlib/SRC/slaqr4.f @@ -287,7 +287,7 @@ * . SLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -371,22 +371,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'SLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -434,7 +434,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -585,7 +585,7 @@ * * ==== Got NS/2 or fewer shifts? Use SLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -700,7 +700,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/slaqr5.f b/lapack-netlib/SRC/slaqr5.f index f04ee577e..d60a1d3c0 100644 --- a/lapack-netlib/SRC/slaqr5.f +++ b/lapack-netlib/SRC/slaqr5.f @@ -70,10 +70,9 @@ *> matrix entries. *> = 1: SLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. 
-*> = 2: SLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -178,14 +177,14 @@ *> *> \param[out] U *> \verbatim -*> U is REAL array, dimension (LDU,3*NSHFTS-3) +*> U is REAL array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -197,7 +196,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is REAL array, dimension (LDWV,3*NSHFTS-3) +*> WV is REAL array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -223,7 +222,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -234,7 +233,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup realOTHERauxiliary * @@ -243,6 +242,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -252,10 +256,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE SLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, $ SR, SI, H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, $ LDU, NV, WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -282,11 +291,11 @@ REAL ALPHA, BETA, H11, H12, H21, H22, REFSUM, $ SAFMAX, SAFMIN, SCL, SMLNUM, SWAP, TST1, TST2, $ ULP - INTEGER I, I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I, I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. REAL SLAMCH @@ -356,10 +365,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. ( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -371,28 +376,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 220 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. 
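+*        (When reflections are accumulated in U, rows above
+*        MAX( KTOP, INCOL ) are updated afterwards by the blocked
+*        far-from-diagonal multiplies; if a full Schur form is
+*        wanted, every row of H participates; otherwise only rows
+*        KTOP through KBOT are needed.)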
+* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL SLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS-1 columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 150 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -401,17 +417,134 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 20 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), + $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), + $ V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. 
==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) + IF( TST1.EQ.ZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + ABS( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + ABS( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + ABS( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + ABS( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + ABS( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + ABS( H( K+4, K+1 ) ) + END IF + IF( ABS( H( K+1, K ) ).LE.MAX( SMLNUM, ULP*TST1 ) ) + $ THEN + H12 = MAX( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H21 = MIN( ABS( H( K+1, K ) ), + $ ABS( H( K, K+1 ) ) ) + H11 = MAX( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( ABS( H( K+1, K+1 ) ), + $ ABS( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M22 ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL SLAQR1( 3, H( KTOP, KTOP ), LDH, SR( 2*M-1 ), $ SI( 2*M-1 ), SR( 2*M ), SI( 2*M ), @@ -419,7 +552,20 @@ ALPHA = V( 1, M ) CALL SLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*V( 2, M ) + H( K+3, K+2 ) = H( K+3, K+2 ) - REFSUM*V( 3, M ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL SLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -467,7 +613,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -481,154 +627,29 @@ END IF END IF END IF - 20 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL SLAQR1( 2, H( K+1, K+1 ), LDH, SR( 2*M22-1 ), - $ SI( 2*M22-1 ), SR( 2*M22 ), SI( 2*M22 ), - $ V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL SLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF -* -* ==== Multiply H by reflections from the left ==== -* - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 40 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 30 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* - $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 30 CONTINUE - 40 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 50 J = MAX( K+1, KTOP ), JBOT - REFSUM = V( 1, M22 )*( H( K+1, J )+V( 2, M22 )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 50 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 90 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 60 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) - H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) - 60 CONTINUE + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M ) + H( J, K+3 ) = H( J, K+3 ) - REFSUM*V( 3, M ) + 70 CONTINUE * - IF( ACCUM ) THEN +* ==== Perform update from left for subsequent +* . column. ==== * -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 70 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) - U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) - 70 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
==== -* - DO 80 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) - Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) - 80 CONTINUE - END IF - END IF - 90 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 100 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - REFSUM*V( 2, M22 ) - 100 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 110 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM* - $ V( 2, M22 ) - 110 CONTINUE - ELSE IF( WANTZ ) THEN - DO 120 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M22 ) - 120 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 130 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = V( 1, M )*( H( K+1, K+1 )+V( 2, M )* + $ H( K+2, K+1 )+V( 3, M )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -639,6 +660,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = ABS( H( K, K ) ) + ABS( H( K+1, K+1 ) ) IF( TST1.EQ.ZERO ) THEN @@ -667,25 +690,77 @@ TST2 = H22*( H11 / SCL ) * IF( TST2.EQ.ZERO .OR. H21*( H12 / SCL ).LE. - $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + $ MAX( SMLNUM, ULP*TST2 ) ) THEN + H( K+1, K ) = ZERO + END IF END IF END IF - 130 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 140 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*V( 2, M ) - H( K+4, K+3 ) = H( K+4, K+3 ) - REFSUM*V( 3, M ) - 140 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = V( 1, M )*( H( K+1, J )+V( 2, M )* + $ H( K+2, J )+V( 3, M )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - REFSUM*V( 2, M ) + U( J, KMS+3 ) = U( J, KMS+3 ) - REFSUM*V( 3, M ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*V( 2, M ) + Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*V( 3, M ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 150 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -699,220 +774,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 160 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL SGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL SLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 160 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 170 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) + $ Z( JROW, INCOL+K1 ), LDZ ) 170 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 180 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL SGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL SLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ Z( 
JROW, INCOL+K1 ), LDZ ) - 180 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 190 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL SLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL SLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL STRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**T ==== -* - CALL SGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL SLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**T ==== -* - CALL STRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 190 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 200 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 200 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 210 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL SLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL SLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL STRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL SLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL STRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL SGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL SLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 210 CONTINUE - END IF END IF END IF - 220 CONTINUE + 180 CONTINUE * * ==== End of SLAQR5 ==== * diff --git a/lapack-netlib/SRC/slarfb_gett.f b/lapack-netlib/SRC/slarfb_gett.f new file mode 100644 index 000000000..7719f2965 --- /dev/null +++ b/lapack-netlib/SRC/slarfb_gett.f @@ -0,0 +1,596 @@ +*> \brief \b SLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SLARFB_GETT applies a real Householder block reflector H from the +*> left to a real (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT is not 'I' or 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number of rows of the matrix A. +*> K is also the order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector.
+*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is REAL array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-K left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is REAL array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**T ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**T, V2**T ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I', V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I', V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular matrix stored in the +*> array B(1:M,1:K), (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K).
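+*>
+*>    As a purely illustrative sketch of this storage scheme (the
+*>    example values below are hypothetical and not part of the
+*>    interface above): with K = 2, M = 2 and N = 3, the arrays A
+*>    and B jointly hold A1, A2, V1, V2 and B2 as
+*>
+*>       A(1:2,1:3) = ( a11 a12 a13 )   B(1:2,1:3) = ( w11 w12 b13 )
+*>                    ( v21 a22 a23 )                ( w21 w22 b23 ),
+*>
+*>    where v21 is the one stored element of V1 (its unit diagonal
+*>    is implicit) and the w entries are the columns of V2.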
+*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. A1_out is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,1:K). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**T ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**T ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**T)*A1_in +*> +*> B1_out: = - V2*T*(V1**T)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**T)*A2_in + (V2**T)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**T)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**T)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> requires more computations than Case (B). +*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**T) * W2 = (unit_lower_tr_of_(A1)**T) * W2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**T) * W1 = (unit_lower_tr_of_(A1)**T) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e.
IDENT == 'I', +*> requires fewer computations than Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combining these cases (A) and (B) gives the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**T) * W2 +*> = (unit_lower_tr_of_(A1)**T) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**T) * W1 +*> = (unit_lower_tr_of_(A1)**T) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE SLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, $ WORK, LDWORK ) IMPLICIT NONE * * -- LAPACK auxiliary routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. CHARACTER IDENT INTEGER K, LDA, LDB, LDT, LDWORK, M, N * .. * .. Array Arguments .. REAL A( LDA, * ), B( LDB, * ), T( LDT, * ), $ WORK( LDWORK, * ) * .. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars .. LOGICAL LNOTIDENT INTEGER I, J * .. * .. EXTERNAL FUNCTIONS .. LOGICAL LSAME EXTERNAL LSAME * .. * .. External Subroutines .. EXTERNAL SCOPY, SGEMM, STRMM * .. * .. Executable Statements .. * * Quick return if possible * IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. K.GT.N ) $ RETURN * LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) * * ------------------------------------------------------------------ * * First Step. Computation of the Column Block 2: * * ( A2 ) := H * ( A2 ) * ( B2 ) ( B2 ) * * ------------------------------------------------------------------ * IF( N.GT.K ) THEN * * col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) * into W2=WORK(1:K, 1:N-K) column-by-column. * DO J = 1, N-K CALL SCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) END DO IF( LNOTIDENT ) THEN * * col2_(2) Compute W2: = (V1**T) * W2 = (A1**T) * W2, * V1 is not an identity matrix, but unit lower-triangular, * V1 stored in A1 (diagonal ones are not stored). * * CALL STRMM( 'L', 'L', 'T', 'U', K, N-K, ONE, A, LDA, $ WORK, LDWORK ) END IF * * col2_(3) Compute W2: = W2 + (V2**T) * B2 = W2 + (B1**T) * B2 * V2 stored in B1.
+* + IF( M.GT.0 ) THEN + CALL SGEMM( 'T', 'N', K, N-K, M, ONE, B, LDB, + $ B( 1, K+1 ), LDB, ONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, N-K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL SGEMM( 'N', 'N', M, N-K, K, -ONE, B, LDB, + $ WORK, LDWORK, ONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL STRMM( 'L', 'L', 'N', 'U', K, N-K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL SCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = ZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**T) * W1 = (A1**T) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'L', 'T', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL STRMM( 'L', 'U', 'N', 'N', K, K, ONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL STRMM( 'R', 'U', 'N', 'N', M, K, -ONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. +* + CALL STRMM( 'L', 'L', 'N', 'U', K, K, ONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If NOT IDENT, A1 is square on output, and W1 is square, +* if IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of SLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/slasq2.f b/lapack-netlib/SRC/slasq2.f index 6e5f86447..219797c4a 100644 --- a/lapack-netlib/SRC/slasq2.f +++ b/lapack-netlib/SRC/slasq2.f @@ -183,10 +183,18 @@ * * 2-by-2 case. * - IF( Z( 2 ).LT.ZERO .OR.
Z( 3 ).LT.ZERO ) THEN - INFO = -2 + IF( Z( 1 ).LT.ZERO ) THEN + INFO = -201 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN + ELSE IF( Z( 2 ).LT.ZERO ) THEN + INFO = -202 CALL XERBLA( 'SLASQ2', 2 ) RETURN + ELSE IF( Z( 3 ).LT.ZERO ) THEN + INFO = -203 + CALL XERBLA( 'SLASQ2', 2 ) + RETURN ELSE IF( Z( 3 ).GT.Z( 1 ) ) THEN D = Z( 3 ) Z( 3 ) = Z( 1 ) diff --git a/lapack-netlib/SRC/sorgbr.f b/lapack-netlib/SRC/sorgbr.f index dccdbb58a..2266505dc 100644 --- a/lapack-netlib/SRC/sorgbr.f +++ b/lapack-netlib/SRC/sorgbr.f @@ -221,8 +221,8 @@ CALL SORGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL SORGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -230,8 +230,8 @@ CALL SORGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL SORGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL SORGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/sorgtsqr_row.f b/lapack-netlib/SRC/sorgtsqr_row.f new file mode 100644 index 000000000..d2a2150cd --- /dev/null +++ b/lapack-netlib/SRC/sorgtsqr_row.f @@ -0,0 +1,379 @@ +*> \brief \b SORGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download SORGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* REAL A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORGTSQR_ROW generates an M-by-N real matrix Q_out with +*> orthonormal columns from the output of SLATSQR. These N orthonormal +*> columns are the first N columns of a product of real orthogonal +*> matrices Q(k)_in of order M, which are returned by SLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). +*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of SLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine SLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which SLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by SLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by SLATSQR to return +*> arrays A and T. NB >= 1.
+*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is REAL array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by SLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See SLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e. the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is REAL array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See SLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) REAL array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup singleOTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE SORGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, $ LWORK, INFO ) IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * .. Scalar Arguments .. INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB * .. * .. Array Arguments .. REAL A( LDA, * ), T( LDT, * ), WORK( * ) * .. * * ===================================================================== * * .. Parameters .. REAL ONE, ZERO PARAMETER ( ONE = 1.0E+0, ZERO = 0.0E+0 ) * .. * .. Local Scalars ..
+ LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + REAL DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL SLARFB_GETT, SLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC REAL, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'SORGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = REAL( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL SLASET('U', M, N, ZERO, ONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. +* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL SLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. 
+* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL SLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL SLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = REAL( LWORKOPT ) + RETURN +* +* End of SORGTSQR_ROW +* + END diff --git a/lapack-netlib/SRC/stgsja.f b/lapack-netlib/SRC/stgsja.f index 2a6fc354d..7324da431 100644 --- a/lapack-netlib/SRC/stgsja.f +++ b/lapack-netlib/SRC/stgsja.f @@ -400,7 +400,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - REAL ZERO, ONE + REAL ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) * .. * .. Local Scalars .. @@ -419,7 +419,8 @@ $ SSCAL, XERBLA * .. * .. Intrinsic Functions .. - INTRINSIC ABS, MAX, MIN + INTRINSIC ABS, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -596,9 +597,9 @@ * A1 = A( K+I, N-L+I ) B1 = B( I, N-L+I ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * * change sign if necessary * diff --git a/lapack-netlib/SRC/zgesdd.f b/lapack-netlib/SRC/zgesdd.f index bb9d2c26e..2209f4733 100644 --- a/lapack-netlib/SRC/zgesdd.f +++ b/lapack-netlib/SRC/zgesdd.f @@ -281,9 +281,9 @@ $ ZLASET, ZUNGBR, ZUNGLQ, ZUNGQR, ZUNMBR * .. * .. External Functions .. - LOGICAL LSAME + LOGICAL LSAME, DISNAN DOUBLE PRECISION DLAMCH, ZLANGE - EXTERNAL LSAME, DLAMCH, ZLANGE + EXTERNAL LSAME, DLAMCH, ZLANGE, DISNAN * .. * .. Intrinsic Functions .. INTRINSIC INT, MAX, MIN, SQRT @@ -647,6 +647,10 @@ * Scale A if max element outside range [SMLNUM,BIGNUM] * ANRM = ZLANGE( 'M', M, N, A, LDA, DUM ) + IF( DISNAN( ANRM ) ) THEN + INFO = -4 + RETURN + END IF ISCL = 0 IF( ANRM.GT.ZERO .AND. ANRM.LT.SMLNUM ) THEN ISCL = 1 diff --git a/lapack-netlib/SRC/zgetsqrhrt.f b/lapack-netlib/SRC/zgetsqrhrt.f new file mode 100644 index 000000000..5f0167937 --- /dev/null +++ b/lapack-netlib/SRC/zgetsqrhrt.f @@ -0,0 +1,349 @@ +*> \brief \b ZGETSQRHRT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZGETSQRHRT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZGETSQRHRT computes a NB2-sized column blocked QR-factorization +*> of a complex M-by-N matrix A with M >= N, +*> +*> A = Q * R. +*> +*> The routine uses internally a NB1-sized column blocked and MB1-sized +*> row blocked TSQR-factorization and performs the reconstruction +*> of the Householder vectors from the TSQR output. The routine also +*> converts the R_tsqr factor from the TSQR-factorization output into +*> the R factor that corresponds to the Householder QR-factorization, +*> +*> A = Q_tsqr * R_tsqr = Q * R.
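+*>
+*>  In outline (an informal sketch of the steps performed in the
+*>  body of this routine, summarizing the numbered comments there,
+*>  not additional interface behaviour):
+*>
+*>     1. ZLATSQR:      blocked TSQR, A = Q_tsqr * R_tsqr;
+*>     2. ZUNGTSQR_ROW: form the M-by-N orthonormal factor of
+*>                      Q_tsqr in place in A;
+*>     3. ZUNHR_COL:    reconstruct the Householder vectors and the
+*>                      block reflectors T, together with the
+*>                      diagonal sign matrix S giving R = S * R_tsqr.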
+*> +*> The output Q and R factors are stored in the same format as in ZGEQRT +*> (Q is in blocked compact WY-representation). See the documentation +*> of ZGEQRT for more details on the format. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> The row block size to be used in the blocked TSQR. +*> MB1 > N. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> The column block size to be used in the blocked TSQR. +*> N >= NB1 >= 1. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> The block size to be used in the blocked QR that is +*> output. NB2 >= 1. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: an M-by-N matrix A. +*> +*> On exit: +*> a) the elements on and above the diagonal +*> of the array contain the N-by-N upper-triangular +*> matrix R corresponding to the Householder QR; +*> b) the elements below the diagonal represent Q by +*> the columns of blocked V (compact WY-representation). +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[out] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,N) +*> The upper triangular block reflectors stored in compact form +*> as a sequence of upper triangular blocks. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= NB2. +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ), +*> where +*> NUM_ALL_ROW_BLOCKS = CEIL((M-N)/(MB1-N)), +*> NB1LOCAL = MIN(NB1,N). +*> LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL, +*> LW1 = NB1LOCAL * N, +*> LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ), +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. +*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== SUBROUTINE ZGETSQRHRT( M, N, MB1, NB1, NB2, A, LDA, T, LDT, WORK, $ LWORK, INFO ) IMPLICIT NONE * * -- LAPACK computational routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- * * ..
Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, NB1, NB2, MB1 +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER I, IINFO, J, LW1, LW2, LWT, LDWT, LWORKOPT, + $ NB1LOCAL, NB2LOCAL, NUM_ALL_ROW_BLOCKS +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZLATSQR, ZUNGTSQR_ROW, ZUNHR_COL, + $ XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input arguments +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB1.LE.N ) THEN + INFO = -3 + ELSE IF( NB1.LT.1 ) THEN + INFO = -4 + ELSE IF( NB2.LT.1 ) THEN + INFO = -5 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -7 + ELSE IF( LDT.LT.MAX( 1, MIN( NB2, N ) ) ) THEN + INFO = -9 + ELSE +* +* Test the input LWORK for the dimension of the array WORK. +* This workspace is used to store array: +* a) Matrix T and WORK for ZLATSQR; +* b) N-by-N upper-triangular factor R_tsqr; +* c) Matrix T and array WORK for ZUNGTSQR_ROW; +* d) Diagonal D for ZUNHR_COL. +* + IF( LWORK.LT.N*N+1 .AND. .NOT.LQUERY ) THEN + INFO = -11 + ELSE +* +* Set block size for column blocks +* + NB1LOCAL = MIN( NB1, N ) +* + NUM_ALL_ROW_BLOCKS = MAX( 1, + $ CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* +* Length and leading dimension of WORK array to place +* T array in TSQR. +* + LWT = NUM_ALL_ROW_BLOCKS * N * NB1LOCAL + + LDWT = NB1LOCAL +* +* Length of TSQR work array +* + LW1 = NB1LOCAL * N +* +* Length of ZUNGTSQR_ROW work array. +* + LW2 = NB1LOCAL * MAX( NB1LOCAL, ( N - NB1LOCAL ) ) +* + LWORKOPT = MAX( LWT + LW1, MAX( LWT+N*N+LW2, LWT+N*N+N ) ) +* + IF( ( LWORK.LT.MAX( 1, LWORKOPT ) ).AND.(.NOT.LQUERY) ) THEN + INFO = -11 + END IF +* + END IF + END IF +* +* Handle error in the input parameters and return workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZGETSQRHRT', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* + NB2LOCAL = MIN( NB2, N ) +* +* +* (1) Perform TSQR-factorization of the M-by-N matrix A. +* + CALL ZLATSQR( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK(LWT+1), LW1, IINFO ) +* +* (2) Copy the factor R_tsqr stored in the upper-triangular part +* of A into the square matrix in the work array +* WORK(LWT+1:LWT+N*N) column-by-column. +* + DO J = 1, N + CALL ZCOPY( J, A( 1, J ), 1, WORK( LWT + N*(J-1)+1 ), 1 ) + END DO +* +* (3) Generate a M-by-N matrix Q with orthonormal columns from +* the result stored below the diagonal in the array A in place. +* + + CALL ZUNGTSQR_ROW( M, N, MB1, NB1LOCAL, A, LDA, WORK, LDWT, + $ WORK( LWT+N*N+1 ), LW2, IINFO ) +* +* (4) Perform the reconstruction of Householder vectors from +* the matrix Q (stored in A) in place. +* + CALL ZUNHR_COL( M, N, NB2LOCAL, A, LDA, T, LDT, + $ WORK( LWT+N*N+1 ), IINFO ) +* +* (5) Copy the factor R_tsqr stored in the square matrix in the +* work array WORK(LWT+1:LWT+N*N) into the upper-triangular +* part of A. +* +* (6) Compute from R_tsqr the factor R_hr corresponding to +* the reconstructed Householder vectors, i.e. R_hr = S * R_tsqr. 
+* This multiplication by the sign matrix S on the left means +* changing the sign of I-th row of the matrix R_tsqr according +* to sign of the I-th diagonal element DIAG(I) of the matrix S. +* DIAG is stored in WORK( LWT+N*N+1 ) from the ZUNHR_COL output. +* +* (5) and (6) can be combined in a single loop, so the rows in A +* are accessed only once. +* + DO I = 1, N + IF( WORK( LWT+N*N+I ).EQ.-CONE ) THEN + DO J = I, N + A( I, J ) = -CONE * WORK( LWT+N*(J-1)+I ) + END DO + ELSE + CALL ZCOPY( N-I+1, WORK(LWT+N*(I-1)+I), N, A( I, I ), LDA ) + END IF + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZGETSQRHRT +* + END \ No newline at end of file diff --git a/lapack-netlib/SRC/zggglm.f b/lapack-netlib/SRC/zggglm.f index d6a30cee7..d4adc5c4d 100644 --- a/lapack-netlib/SRC/zggglm.f +++ b/lapack-netlib/SRC/zggglm.f @@ -271,8 +271,15 @@ * * Quick return if possible * - IF( N.EQ.0 ) - $ RETURN + IF( N.EQ.0 ) THEN + DO I = 1, M + X(I) = CZERO + END DO + DO I = 1, P + Y(I) = CZERO + END DO + RETURN + END IF * * Compute the GQR factorization of matrices A and B: * diff --git a/lapack-netlib/SRC/zhgeqz.f b/lapack-netlib/SRC/zhgeqz.f index b28ae47a4..960244727 100644 --- a/lapack-netlib/SRC/zhgeqz.f +++ b/lapack-netlib/SRC/zhgeqz.f @@ -319,7 +319,7 @@ DOUBLE PRECISION ABSB, ANORM, ASCALE, ATOL, BNORM, BSCALE, BTOL, $ C, SAFMIN, TEMP, TEMP2, TEMPR, ULP COMPLEX*16 ABI22, AD11, AD12, AD21, AD22, CTEMP, CTEMP2, - $ CTEMP3, ESHIFT, RTDISC, S, SHIFT, SIGNBC, T1, + $ CTEMP3, ESHIFT, S, SHIFT, SIGNBC, $ U12, X, ABI12, Y * .. * .. External Functions .. @@ -352,6 +352,7 @@ ILSCHR = .TRUE. ISCHUR = 2 ELSE + ILSCHR = .TRUE. ISCHUR = 0 END IF * @@ -365,6 +366,7 @@ ILQ = .TRUE. ICOMPQ = 3 ELSE + ILQ = .TRUE. ICOMPQ = 0 END IF * @@ -378,6 +380,7 @@ ILZ = .TRUE. ICOMPZ = 3 ELSE + ILZ = .TRUE. ICOMPZ = 0 END IF * diff --git a/lapack-netlib/SRC/zhseqr.f b/lapack-netlib/SRC/zhseqr.f index 2ee874dfd..e0fddd3a7 100644 --- a/lapack-netlib/SRC/zhseqr.f +++ b/lapack-netlib/SRC/zhseqr.f @@ -320,10 +320,10 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== NL allocates some local workspace to help small matrices -* . through a rare ZLAHQR failure. NL > NTINY = 11 is +* . through a rare ZLAHQR failure. NL > NTINY = 15 is * . required and NL <= NMIN = ILAENV(ISPEC=12,...) is recom- * . mended. (The default value of NMIN is 75.) Using NL = 49 * . allows up to six simultaneous shifts and a 16-by-16 diff --git a/lapack-netlib/SRC/zlaqr0.f b/lapack-netlib/SRC/zlaqr0.f index feffe9782..edf01bc7c 100644 --- a/lapack-netlib/SRC/zlaqr0.f +++ b/lapack-netlib/SRC/zlaqr0.f @@ -262,7 +262,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -357,22 +357,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. 
NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR0', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -420,7 +420,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -560,7 +560,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAQR4 or * . ZLAHQR on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. ==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr4.f b/lapack-netlib/SRC/zlaqr4.f index a88f6508e..6d083fcda 100644 --- a/lapack-netlib/SRC/zlaqr4.f +++ b/lapack-netlib/SRC/zlaqr4.f @@ -268,7 +268,7 @@ * . ZLAHQR because of insufficient subdiagonal scratch space. * . (This is a hard limit.) ==== INTEGER NTINY - PARAMETER ( NTINY = 11 ) + PARAMETER ( NTINY = 15 ) * * ==== Exceptional deflation windows: try to cure rare * . slow convergence by varying the size of the @@ -363,22 +363,22 @@ END IF * * ==== NWR = recommended deflation window size. At this -* . point, N .GT. NTINY = 11, so there is enough +* . point, N .GT. NTINY = 15, so there is enough * . subdiagonal workspace for NWR.GE.2 as required. * . (In fact, there is enough subdiagonal space for -* . NWR.GE.3.) ==== +* . NWR.GE.4.) ==== * NWR = ILAENV( 13, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) NWR = MAX( 2, NWR ) NWR = MIN( IHI-ILO+1, ( N-1 ) / 3, NWR ) * * ==== NSR = recommended number of simultaneous shifts. -* . At this point N .GT. NTINY = 11, so there is at +* . At this point N .GT. NTINY = 15, so there is at * . enough subdiagonal workspace for NSR to be even * . and greater than or equal to two as required. ==== * NSR = ILAENV( 15, 'ZLAQR4', JBCMPZ, N, ILO, IHI, LWORK ) - NSR = MIN( NSR, ( N+6 ) / 9, IHI-ILO ) + NSR = MIN( NSR, ( N-3 ) / 6, IHI-ILO ) NSR = MAX( 2, NSR-MOD( NSR, 2 ) ) * * ==== Estimate optimal workspace ==== @@ -426,7 +426,7 @@ * ==== NSMAX = the Largest number of simultaneous shifts * . for which there is sufficient workspace. ==== * - NSMAX = MIN( ( N+6 ) / 9, 2*LWORK / 3 ) + NSMAX = MIN( ( N-3 ) / 6, 2*LWORK / 3 ) NSMAX = NSMAX - MOD( NSMAX, 2 ) * * ==== NDFL: an iteration count restarted at deflation. ==== @@ -566,7 +566,7 @@ * * ==== Got NS/2 or fewer shifts? Use ZLAHQR * . on a trailing principal submatrix to -* . get more. (Since NS.LE.NSMAX.LE.(N+6)/9, +* . get more. (Since NS.LE.NSMAX.LE.(N-3)/6, * . there is enough space below the subdiagonal * . to fit an NS-by-NS scratch array.) ==== * @@ -661,7 +661,7 @@ * . (NVE-by-KDU) vertical work WV arrow along * . the left-hand-edge. 
==== * - KDU = 3*NS - 3 + KDU = 2*NS KU = N - KDU + 1 KWH = KDU + 1 NHO = ( N-KDU+1-4 ) - ( KDU+1 ) + 1 diff --git a/lapack-netlib/SRC/zlaqr5.f b/lapack-netlib/SRC/zlaqr5.f index 9ff7e7eca..c12f4b780 100644 --- a/lapack-netlib/SRC/zlaqr5.f +++ b/lapack-netlib/SRC/zlaqr5.f @@ -69,10 +69,9 @@ *> matrix entries. *> = 1: ZLAQR5 accumulates reflections and uses matrix-matrix *> multiply to update the far-from-diagonal matrix entries. -*> = 2: ZLAQR5 accumulates reflections, uses matrix-matrix -*> multiply to update the far-from-diagonal matrix entries, -*> and takes advantage of 2-by-2 block structure during -*> matrix multiplies. +*> = 2: Same as KACC22 = 1. This option used to enable exploiting +*> the 2-by-2 structure during matrix multiplications, but +*> this is no longer supported. *> \endverbatim *> *> \param[in] N @@ -170,14 +169,14 @@ *> *> \param[out] U *> \verbatim -*> U is COMPLEX*16 array, dimension (LDU,3*NSHFTS-3) +*> U is COMPLEX*16 array, dimension (LDU,2*NSHFTS) *> \endverbatim *> *> \param[in] LDU *> \verbatim *> LDU is INTEGER *> LDU is the leading dimension of U just as declared in the -*> in the calling subroutine. LDU >= 3*NSHFTS-3. +*> in the calling subroutine. LDU >= 2*NSHFTS. *> \endverbatim *> *> \param[in] NV @@ -189,7 +188,7 @@ *> *> \param[out] WV *> \verbatim -*> WV is COMPLEX*16 array, dimension (LDWV,3*NSHFTS-3) +*> WV is COMPLEX*16 array, dimension (LDWV,2*NSHFTS) *> \endverbatim *> *> \param[in] LDWV @@ -215,7 +214,7 @@ *> \verbatim *> LDWH is INTEGER *> Leading dimension of WH just as declared in the -*> calling procedure. LDWH >= 3*NSHFTS-3. +*> calling procedure. LDWH >= 2*NSHFTS. *> \endverbatim *> * Authors: @@ -226,7 +225,7 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date June 2016 +*> \date January 2021 * *> \ingroup complex16OTHERauxiliary * @@ -235,6 +234,11 @@ *> *> Karen Braman and Ralph Byers, Department of Mathematics, *> University of Kansas, USA +*> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang +*> +*> Thijs Steel, Department of Computer science, +*> KU Leuven, Belgium * *> \par References: * ================ @@ -244,10 +248,15 @@ *> Performance, SIAM Journal of Matrix Analysis, volume 23, pages *> 929--947, 2002. *> +*> Lars Karlsson, Daniel Kressner, and Bruno Lang, Optimally packed +*> chains of bulges in multishift QR algorithms. +*> ACM Trans. Math. Softw. 40, 2, Article 12 (February 2014). +*> * ===================================================================== SUBROUTINE ZLAQR5( WANTT, WANTZ, KACC22, N, KTOP, KBOT, NSHFTS, S, $ H, LDH, ILOZ, IHIZ, Z, LDZ, V, LDV, U, LDU, NV, $ WV, LDWV, NH, WH, LDWH ) + IMPLICIT NONE * * -- LAPACK auxiliary routine (version 3.7.1) -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- @@ -276,11 +285,11 @@ COMPLEX*16 ALPHA, BETA, CDUM, REFSUM DOUBLE PRECISION H11, H12, H21, H22, SAFMAX, SAFMIN, SCL, $ SMLNUM, TST1, TST2, ULP - INTEGER I2, I4, INCOL, J, J2, J4, JBOT, JCOL, JLEN, - $ JROW, JTOP, K, K1, KDU, KMS, KNZ, KRCOL, KZS, - $ M, M22, MBOT, MEND, MSTART, MTOP, NBMPS, NDCOL, + INTEGER I2, I4, INCOL, J, JBOT, JCOL, JLEN, + $ JROW, JTOP, K, K1, KDU, KMS, KRCOL, + $ M, M22, MBOT, MTOP, NBMPS, NDCOL, $ NS, NU - LOGICAL ACCUM, BLK22, BMP22 + LOGICAL ACCUM, BMP22 * .. * .. External Functions .. DOUBLE PRECISION DLAMCH @@ -334,10 +343,6 @@ * ACCUM = ( KACC22.EQ.1 ) .OR. ( KACC22.EQ.2 ) * -* ==== If so, exploit the 2-by-2 block structure? ==== -* - BLK22 = ( NS.GT.2 ) .AND. 
( KACC22.EQ.2 ) -* * ==== clear trash ==== * IF( KTOP+2.LE.KBOT ) @@ -349,28 +354,39 @@ * * ==== KDU = width of slab ==== * - KDU = 6*NBMPS - 3 + KDU = 4*NBMPS * * ==== Create and chase chains of NBMPS bulges ==== * - DO 210 INCOL = 3*( 1-NBMPS ) + KTOP - 1, KBOT - 2, 3*NBMPS - 2 + DO 180 INCOL = KTOP - 2*NBMPS + 1, KBOT - 2, 2*NBMPS +* +* JTOP = Index from which updates from the right start. +* + IF( ACCUM ) THEN + JTOP = MAX( KTOP, INCOL ) + ELSE IF( WANTT ) THEN + JTOP = 1 + ELSE + JTOP = KTOP + END IF +* NDCOL = INCOL + KDU IF( ACCUM ) $ CALL ZLASET( 'ALL', KDU, KDU, ZERO, ONE, U, LDU ) * * ==== Near-the-diagonal bulge chase. The following loop * . performs the near-the-diagonal part of a small bulge -* . multi-shift QR sweep. Each 6*NBMPS-2 column diagonal +* . multi-shift QR sweep. Each 4*NBMPS column diagonal * . chunk extends from column INCOL to column NDCOL * . (including both column INCOL and column NDCOL). The -* . following loop chases a 3*NBMPS column long chain of -* . NBMPS bulges 3*NBMPS-2 columns to the right. (INCOL +* . following loop chases a 2*NBMPS+1 column long chain of +* . NBMPS bulges 2*NBMPS columns to the right. (INCOL * . may be less than KTOP and and NDCOL may be greater than * . KBOT indicating phantom columns from which to chase * . bulges before they are actually introduced or to which * . to chase bulges beyond column KBOT.) ==== * - DO 140 KRCOL = INCOL, MIN( INCOL+3*NBMPS-3, KBOT-2 ) + DO 145 KRCOL = INCOL, MIN( INCOL+2*NBMPS-1, KBOT-2 ) * * ==== Bulges number MTOP to MBOT are active double implicit * . shift bulges. There may or may not also be small @@ -379,24 +395,156 @@ * . down the diagonal to make room. The phantom matrix * . paradigm described above helps keep track. ==== * - MTOP = MAX( 1, ( ( KTOP-1 )-KRCOL+2 ) / 3+1 ) - MBOT = MIN( NBMPS, ( KBOT-KRCOL ) / 3 ) + MTOP = MAX( 1, ( KTOP-KRCOL ) / 2+1 ) + MBOT = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 2 ) M22 = MBOT + 1 - BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+3*( M22-1 ) ).EQ. + BMP22 = ( MBOT.LT.NBMPS ) .AND. ( KRCOL+2*( M22-1 ) ).EQ. $ ( KBOT-2 ) * * ==== Generate reflections to chase the chain right * . one column. (The minimum value of K is KTOP-1.) ==== * - DO 10 M = MTOP, MBOT - K = KRCOL + 3*( M-1 ) + IF ( BMP22 ) THEN +* +* ==== Special case: 2-by-2 reflection at bottom treated +* . separately ==== +* + K = KRCOL + 2*( M22-1 ) + IF( K.EQ.KTOP-1 ) THEN + CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), + $ S( 2*M22 ), V( 1, M22 ) ) + BETA = V( 1, M22 ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + ELSE + BETA = H( K+1, K ) + V( 2, M22 ) = H( K+2, K ) + CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) + H( K+1, K ) = BETA + H( K+2, K ) = ZERO + END IF + +* +* ==== Perform update from right within +* . computational window. ==== +* + DO 30 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* + $ H( J, K+2 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 30 CONTINUE +* +* ==== Perform update from left within +* . computational window. ==== +* + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF + DO 40 J = K+1, JBOT + REFSUM = DCONJG( V( 1, M22 ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* + $ H( K+2, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) + 40 CONTINUE +* +* ==== The following convergence test requires that +* . the tradition small-compared-to-nearby-diagonals +* . 
criterion and the Ahues & Tisseur (LAWN 122, 1997) +* . criteria both be satisfied. The latter improves +* . accuracy in some examples. Falling back on an +* . alternate convergence criterion when TST1 or TST2 +* . is zero (as done here) is traditional but probably +* . unnecessary. ==== +* + IF( K.GE.KTOP ) THEN + IF( H( K+1, K ).NE.ZERO ) THEN + TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) + IF( TST1.EQ.RZERO ) THEN + IF( K.GE.KTOP+1 ) + $ TST1 = TST1 + CABS1( H( K, K-1 ) ) + IF( K.GE.KTOP+2 ) + $ TST1 = TST1 + CABS1( H( K, K-2 ) ) + IF( K.GE.KTOP+3 ) + $ TST1 = TST1 + CABS1( H( K, K-3 ) ) + IF( K.LE.KBOT-2 ) + $ TST1 = TST1 + CABS1( H( K+2, K+1 ) ) + IF( K.LE.KBOT-3 ) + $ TST1 = TST1 + CABS1( H( K+3, K+1 ) ) + IF( K.LE.KBOT-4 ) + $ TST1 = TST1 + CABS1( H( K+4, K+1 ) ) + END IF + IF( CABS1( H( K+1, K ) ) + $ .LE.MAX( SMLNUM, ULP*TST1 ) ) THEN + H12 = MAX( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H21 = MIN( CABS1( H( K+1, K ) ), + $ CABS1( H( K, K+1 ) ) ) + H11 = MAX( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + H22 = MIN( CABS1( H( K+1, K+1 ) ), + $ CABS1( H( K, K )-H( K+1, K+1 ) ) ) + SCL = H11 + H12 + TST2 = H22*( H11 / SCL ) +* + IF( TST2.EQ.RZERO .OR. H21*( H12 / SCL ).LE. + $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO + END IF + END IF + END IF +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN + KMS = K - INCOL + DO 50 J = MAX( 1, KTOP-INCOL ), KDU + REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ + $ V( 2, M22 )*U( J, KMS+2 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 50 CONTINUE + ELSE IF( WANTZ ) THEN + DO 60 J = ILOZ, IHIZ + REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* + $ Z( J, K+2 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M22 ) ) + 60 CONTINUE + END IF + END IF +* +* ==== Normal case: Chain of 3-by-3 reflections ==== +* + DO 80 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) IF( K.EQ.KTOP-1 ) THEN CALL ZLAQR1( 3, H( KTOP, KTOP ), LDH, S( 2*M-1 ), $ S( 2*M ), V( 1, M ) ) ALPHA = V( 1, M ) CALL ZLARFG( 3, ALPHA, V( 2, M ), 1, V( 1, M ) ) ELSE - BETA = H( K+1, K ) +* +* ==== Perform delayed transformation of row below +* . Mth bulge. Exploit fact that first two elements +* . of row are actually zero. ==== +* + REFSUM = V( 1, M )*V( 3, M )*H( K+3, K+2 ) + H( K+3, K ) = -REFSUM + H( K+3, K+1 ) = -REFSUM*DCONJG( V( 2, M ) ) + H( K+3, K+2 ) = H( K+3, K+2 ) - + $ REFSUM*DCONJG( V( 3, M ) ) +* +* ==== Calculate reflection to move +* . Mth bulge one step. ==== +* + BETA = H( K+1, K ) V( 2, M ) = H( K+2, K ) V( 3, M ) = H( K+3, K ) CALL ZLARFG( 3, BETA, V( 2, M ), 1, V( 1, M ) ) @@ -444,7 +592,7 @@ H( K+3, K ) = ZERO ELSE * -* ==== Stating a new bulge here would +* ==== Starting a new bulge here would * . create only negligible fill. * . Replace the old reflector with * . the new one. ==== @@ -458,163 +606,32 @@ END IF END IF END IF - 10 CONTINUE * -* ==== Generate a 2-by-2 reflection, if needed. ==== +* ==== Apply reflection from the right and +* . the first column of update from the left. +* . These updates are required for the vigilant +* . deflation check. We still delay most of the +* . updates from the left for efficiency. 
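The vigilant deflation test in this block is the standard two-stage check: the subdiagonal entry must first be small against the ULP-scaled neighborhood, and then the Ahues-Tisseur refinement is applied to the 2-by-2 diagonal block. A condensed sketch on precomputed magnitudes (the function and argument names are illustrative; the routine itself works with CABS1 of the complex entries, and ADIFF below approximates CABS1( H(K,K)-H(K+1,K+1) ) by a magnitude difference for the sake of the sketch):

      LOGICAL FUNCTION DEFLOK( AKK, AK1K, AKK1, AK1K1, ULP, SMLNUM )
*     Two-stage deflation test on magnitudes: AKK = |H(k,k)|,
*     AK1K = |H(k+1,k)|, AKK1 = |H(k,k+1)|, AK1K1 = |H(k+1,k+1)|.
      DOUBLE PRECISION AKK, AK1K, AKK1, AK1K1, ULP, SMLNUM
      DOUBLE PRECISION TST1, TST2, SCL, H11, H12, H21, H22, ADIFF
      DEFLOK = .FALSE.
*     Traditional test: subdiagonal small versus nearby diagonals.
      TST1 = AKK + AK1K1
      IF( AK1K.LE.MAX( SMLNUM, ULP*TST1 ) ) THEN
*        Ahues and Tisseur (LAWN 122, 1997) refinement.
         ADIFF = ABS( AKK - AK1K1 )
         H12 = MAX( AK1K, AKK1 )
         H21 = MIN( AK1K, AKK1 )
         H11 = MAX( AK1K1, ADIFF )
         H22 = MIN( AK1K1, ADIFF )
         SCL = H11 + H12
         TST2 = H22*( H11 / SCL )
         DEFLOK = TST2.EQ.0.0D+0 .OR.
     $            H21*( H12 / SCL ).LE.MAX( SMLNUM, ULP*TST2 )
      END IF
      END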
==== * - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF( K.EQ.KTOP-1 ) THEN - CALL ZLAQR1( 2, H( K+1, K+1 ), LDH, S( 2*M22-1 ), - $ S( 2*M22 ), V( 1, M22 ) ) - BETA = V( 1, M22 ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - ELSE - BETA = H( K+1, K ) - V( 2, M22 ) = H( K+2, K ) - CALL ZLARFG( 2, BETA, V( 2, M22 ), 1, V( 1, M22 ) ) - H( K+1, K ) = BETA - H( K+2, K ) = ZERO - END IF - END IF + DO 70 J = JTOP, MIN( KBOT, K+3 ) + REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* + $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) + H( J, K+1 ) = H( J, K+1 ) - REFSUM + H( J, K+2 ) = H( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + H( J, K+3 ) = H( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 70 CONTINUE * -* ==== Multiply H by reflections from the left ==== +* ==== Perform update from left for subsequent +* . column. ==== * - IF( ACCUM ) THEN - JBOT = MIN( NDCOL, KBOT ) - ELSE IF( WANTT ) THEN - JBOT = N - ELSE - JBOT = KBOT - END IF - DO 30 J = MAX( KTOP, KRCOL ), JBOT - MEND = MIN( MBOT, ( J-KRCOL+2 ) / 3 ) - DO 20 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = DCONJG( V( 1, M ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M ) )* - $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) - H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) - 20 CONTINUE - 30 CONTINUE - IF( BMP22 ) THEN - K = KRCOL + 3*( M22-1 ) - DO 40 J = MAX( K+1, KTOP ), JBOT - REFSUM = DCONJG( V( 1, M22 ) )* - $ ( H( K+1, J )+DCONJG( V( 2, M22 ) )* - $ H( K+2, J ) ) - H( K+1, J ) = H( K+1, J ) - REFSUM - H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M22 ) - 40 CONTINUE - END IF -* -* ==== Multiply H by reflections from the right. -* . Delay filling in the last row until the -* . vigilant deflation check is complete. ==== -* - IF( ACCUM ) THEN - JTOP = MAX( KTOP, INCOL ) - ELSE IF( WANTT ) THEN - JTOP = 1 - ELSE - JTOP = KTOP - END IF - DO 80 M = MTOP, MBOT - IF( V( 1, M ).NE.ZERO ) THEN - K = KRCOL + 3*( M-1 ) - DO 50 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M )*( H( J, K+1 )+V( 2, M )* - $ H( J, K+2 )+V( 3, M )*H( J, K+3 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - H( J, K+3 ) = H( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 50 CONTINUE -* - IF( ACCUM ) THEN -* -* ==== Accumulate U. (If necessary, update Z later -* . with with an efficient matrix-matrix -* . multiply.) ==== -* - KMS = K - INCOL - DO 60 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* - $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - U( J, KMS+3 ) = U( J, KMS+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 60 CONTINUE - ELSE IF( WANTZ ) THEN -* -* ==== U is not accumulated, so update Z -* . now by multiplying by reflections -* . from the right. 
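When ACCUM is set, the reflections are folded into the KDU-by-KDU unitary U and the far-from-diagonal part of H is touched only once per slab, with Level-3 BLAS. The pattern, as a hedged fragment (names follow the routine; WH is scratch with leading dimension LDWH):

*     Update a horizontal strip of H in place:
*     H(INCOL+K1:INCOL+K1+NU-1, JCOL:JCOL+JLEN-1) :=
*     U(K1:K1+NU-1, K1:K1+NU-1)**H * (the same strip of H).
      CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), LDU,
     $            H( INCOL+K1, JCOL ), LDH, ZERO, WH, LDWH )
      CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH,
     $             H( INCOL+K1, JCOL ), LDH )

This is the same ZGEMM/ZLACPY pair that appears in the 150-loop further down; the non-ACCUM branch trades those BLAS-3 calls for the elementwise reflector loops shown here.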
==== -* - DO 70 J = ILOZ, IHIZ - REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* - $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M ) ) - Z( J, K+3 ) = Z( J, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 70 CONTINUE - END IF - END IF - 80 CONTINUE -* -* ==== Special case: 2-by-2 reflection (if needed) ==== -* - K = KRCOL + 3*( M22-1 ) - IF( BMP22 ) THEN - IF ( V( 1, M22 ).NE.ZERO ) THEN - DO 90 J = JTOP, MIN( KBOT, K+3 ) - REFSUM = V( 1, M22 )*( H( J, K+1 )+V( 2, M22 )* - $ H( J, K+2 ) ) - H( J, K+1 ) = H( J, K+1 ) - REFSUM - H( J, K+2 ) = H( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 90 CONTINUE -* - IF( ACCUM ) THEN - KMS = K - INCOL - DO 100 J = MAX( 1, KTOP-INCOL ), KDU - REFSUM = V( 1, M22 )*( U( J, KMS+1 )+ - $ V( 2, M22 )*U( J, KMS+2 ) ) - U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM - U( J, KMS+2 ) = U( J, KMS+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 100 CONTINUE - ELSE IF( WANTZ ) THEN - DO 110 J = ILOZ, IHIZ - REFSUM = V( 1, M22 )*( Z( J, K+1 )+V( 2, M22 )* - $ Z( J, K+2 ) ) - Z( J, K+1 ) = Z( J, K+1 ) - REFSUM - Z( J, K+2 ) = Z( J, K+2 ) - - $ REFSUM*DCONJG( V( 2, M22 ) ) - 110 CONTINUE - END IF - END IF - END IF -* -* ==== Vigilant deflation check ==== -* - MSTART = MTOP - IF( KRCOL+3*( MSTART-1 ).LT.KTOP ) - $ MSTART = MSTART + 1 - MEND = MBOT - IF( BMP22 ) - $ MEND = MEND + 1 - IF( KRCOL.EQ.KBOT-2 ) - $ MEND = MEND + 1 - DO 120 M = MSTART, MEND - K = MIN( KBOT-1, KRCOL+3*( M-1 ) ) + REFSUM = DCONJG( V( 1, M ) )*( H( K+1, K+1 ) + $ +DCONJG( V( 2, M ) )*H( K+2, K+1 ) + $ +DCONJG( V( 3, M ) )*H( K+3, K+1 ) ) + H( K+1, K+1 ) = H( K+1, K+1 ) - REFSUM + H( K+2, K+1 ) = H( K+2, K+1 ) - REFSUM*V( 2, M ) + H( K+3, K+1 ) = H( K+3, K+1 ) - REFSUM*V( 3, M ) * * ==== The following convergence test requires that * . the tradition small-compared-to-nearby-diagonals @@ -625,6 +642,8 @@ * . is zero (as done here) is traditional but probably * . unnecessary. ==== * + IF( K.LT.KTOP) + $ CYCLE IF( H( K+1, K ).NE.ZERO ) THEN TST1 = CABS1( H( K, K ) ) + CABS1( H( K+1, K+1 ) ) IF( TST1.EQ.RZERO ) THEN @@ -658,23 +677,77 @@ $ MAX( SMLNUM, ULP*TST2 ) )H( K+1, K ) = ZERO END IF END IF - 120 CONTINUE + 80 CONTINUE * -* ==== Fill in the last row of each bulge. ==== +* ==== Multiply H by reflections from the left ==== * - MEND = MIN( NBMPS, ( KBOT-KRCOL-1 ) / 3 ) - DO 130 M = MTOP, MEND - K = KRCOL + 3*( M-1 ) - REFSUM = V( 1, M )*V( 3, M )*H( K+4, K+3 ) - H( K+4, K+1 ) = -REFSUM - H( K+4, K+2 ) = -REFSUM*DCONJG( V( 2, M ) ) - H( K+4, K+3 ) = H( K+4, K+3 ) - - $ REFSUM*DCONJG( V( 3, M ) ) - 130 CONTINUE + IF( ACCUM ) THEN + JBOT = MIN( NDCOL, KBOT ) + ELSE IF( WANTT ) THEN + JBOT = N + ELSE + JBOT = KBOT + END IF +* + DO 100 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 90 J = MAX( KTOP, KRCOL + 2*M ), JBOT + REFSUM = DCONJG( V( 1, M ) )* + $ ( H( K+1, J )+DCONJG( V( 2, M ) )* + $ H( K+2, J )+DCONJG( V( 3, M ) )*H( K+3, J ) ) + H( K+1, J ) = H( K+1, J ) - REFSUM + H( K+2, J ) = H( K+2, J ) - REFSUM*V( 2, M ) + H( K+3, J ) = H( K+3, J ) - REFSUM*V( 3, M ) + 90 CONTINUE + 100 CONTINUE +* +* ==== Accumulate orthogonal transformations. ==== +* + IF( ACCUM ) THEN +* +* ==== Accumulate U. (If needed, update Z later +* . with an efficient matrix-matrix +* . multiply.) 
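Whether Z is updated directly, as in the loop removed here, or later through the accumulated U, the elementary operation is the same: a 3-element Householder reflector applied from the right. A standalone sketch of that kernel (subroutine name illustrative; V is scaled as by ZLARFG, with V(1) playing the role of tau):

      SUBROUTINE ZAPPR3( NROWS, Z, LDZ, K, V )
*     Apply the reflector to columns K+1:K+3 of Z from the right,
*     matching the update performed by the Z loops above.
      INTEGER NROWS, LDZ, K, J
      COMPLEX*16 Z( LDZ, * ), V( 3 ), REFSUM
      INTRINSIC DCONJG
      DO 10 J = 1, NROWS
         REFSUM = V( 1 )*( Z( J, K+1 ) + V( 2 )*Z( J, K+2 ) +
     $            V( 3 )*Z( J, K+3 ) )
         Z( J, K+1 ) = Z( J, K+1 ) - REFSUM
         Z( J, K+2 ) = Z( J, K+2 ) - REFSUM*DCONJG( V( 2 ) )
         Z( J, K+3 ) = Z( J, K+3 ) - REFSUM*DCONJG( V( 3 ) )
   10 CONTINUE
      END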
==== +* + DO 120 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + KMS = K - INCOL + I2 = MAX( 1, KTOP-INCOL ) + I2 = MAX( I2, KMS-(KRCOL-INCOL)+1 ) + I4 = MIN( KDU, KRCOL + 2*( MBOT-1 ) - INCOL + 5 ) + DO 110 J = I2, I4 + REFSUM = V( 1, M )*( U( J, KMS+1 )+V( 2, M )* + $ U( J, KMS+2 )+V( 3, M )*U( J, KMS+3 ) ) + U( J, KMS+1 ) = U( J, KMS+1 ) - REFSUM + U( J, KMS+2 ) = U( J, KMS+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + U( J, KMS+3 ) = U( J, KMS+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 110 CONTINUE + 120 CONTINUE + ELSE IF( WANTZ ) THEN +* +* ==== U is not accumulated, so update Z +* . now by multiplying by reflections +* . from the right. ==== +* + DO 140 M = MBOT, MTOP, -1 + K = KRCOL + 2*( M-1 ) + DO 130 J = ILOZ, IHIZ + REFSUM = V( 1, M )*( Z( J, K+1 )+V( 2, M )* + $ Z( J, K+2 )+V( 3, M )*Z( J, K+3 ) ) + Z( J, K+1 ) = Z( J, K+1 ) - REFSUM + Z( J, K+2 ) = Z( J, K+2 ) - + $ REFSUM*DCONJG( V( 2, M ) ) + Z( J, K+3 ) = Z( J, K+3 ) - + $ REFSUM*DCONJG( V( 3, M ) ) + 130 CONTINUE + 140 CONTINUE + END IF * * ==== End of near-the-diagonal bulge chase. ==== * - 140 CONTINUE + 145 CONTINUE * * ==== Use U (if accumulated) to update far-from-diagonal * . entries in H. If required, use U to update Z as @@ -688,220 +761,45 @@ JTOP = KTOP JBOT = KBOT END IF - IF( ( .NOT.BLK22 ) .OR. ( INCOL.LT.KTOP ) .OR. - $ ( NDCOL.GT.KBOT ) .OR. ( NS.LE.2 ) ) THEN + K1 = MAX( 1, KTOP-INCOL ) + NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 * -* ==== Updates not exploiting the 2-by-2 block -* . structure of U. K1 and NU keep track of -* . the location and size of U in the special -* . cases of introducing bulges and chasing -* . bulges off the bottom. In these special -* . cases and in case the number of shifts -* . is NS = 2, there is no 2-by-2 block -* . structure to exploit. ==== +* ==== Horizontal Multiply ==== * - K1 = MAX( 1, KTOP-INCOL ) - NU = ( KDU-MAX( 0, NDCOL-KBOT ) ) - K1 + 1 + DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH + JLEN = MIN( NH, JBOT-JCOL+1 ) + CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), + $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, + $ LDWH ) + CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, + $ H( INCOL+K1, JCOL ), LDH ) + 150 CONTINUE * -* ==== Horizontal Multiply ==== +* ==== Vertical multiply ==== * - DO 150 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) - CALL ZGEMM( 'C', 'N', NU, JLEN, NU, ONE, U( K1, K1 ), - $ LDU, H( INCOL+K1, JCOL ), LDH, ZERO, WH, - $ LDWH ) - CALL ZLACPY( 'ALL', NU, JLEN, WH, LDWH, - $ H( INCOL+K1, JCOL ), LDH ) - 150 CONTINUE + DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV + JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, + $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ LDU, ZERO, WV, LDWV ) + CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, + $ H( JROW, INCOL+K1 ), LDH ) + 160 CONTINUE * -* ==== Vertical multiply ==== +* ==== Z multiply (also vertical) ==== * - DO 160 JROW = JTOP, MAX( KTOP, INCOL ) - 1, NV - JLEN = MIN( NV, MAX( KTOP, INCOL )-JROW ) + IF( WANTZ ) THEN + DO 170 JROW = ILOZ, IHIZ, NV + JLEN = MIN( NV, IHIZ-JROW+1 ) CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ H( JROW, INCOL+K1 ), LDH, U( K1, K1 ), + $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), $ LDU, ZERO, WV, LDWV ) CALL ZLACPY( 'ALL', JLEN, NU, WV, LDWV, - $ H( JROW, INCOL+K1 ), LDH ) - 160 CONTINUE -* -* ==== Z multiply (also vertical) ==== -* - IF( WANTZ ) THEN - DO 170 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) - CALL ZGEMM( 'N', 'N', JLEN, NU, NU, ONE, - $ Z( JROW, INCOL+K1 ), LDZ, U( K1, K1 ), - $ LDU, ZERO, WV, LDWV ) - CALL ZLACPY( 'ALL', 
JLEN, NU, WV, LDWV, - $ Z( JROW, INCOL+K1 ), LDZ ) - 170 CONTINUE - END IF - ELSE -* -* ==== Updates exploiting U's 2-by-2 block structure. -* . (I2, I4, J2, J4 are the last rows and columns -* . of the blocks.) ==== -* - I2 = ( KDU+1 ) / 2 - I4 = KDU - J2 = I4 - I2 - J4 = KDU -* -* ==== KZS and KNZ deal with the band of zeros -* . along the diagonal of one of the triangular -* . blocks. ==== -* - KZS = ( J4-J2 ) - ( NS+1 ) - KNZ = NS + 1 -* -* ==== Horizontal multiply ==== -* - DO 180 JCOL = MIN( NDCOL, KBOT ) + 1, JBOT, NH - JLEN = MIN( NH, JBOT-JCOL+1 ) -* -* ==== Copy bottom of H to top+KZS of scratch ==== -* (The first KZS rows get multiplied by zero.) ==== -* - CALL ZLACPY( 'ALL', KNZ, JLEN, H( INCOL+1+J2, JCOL ), - $ LDH, WH( KZS+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZLASET( 'ALL', KZS, JLEN, ZERO, ZERO, WH, LDWH ) - CALL ZTRMM( 'L', 'U', 'C', 'N', KNZ, JLEN, ONE, - $ U( J2+1, 1+KZS ), LDU, WH( KZS+1, 1 ), - $ LDWH ) -* -* ==== Multiply top of H by U11**H ==== -* - CALL ZGEMM( 'C', 'N', I2, JLEN, J2, ONE, U, LDU, - $ H( INCOL+1, JCOL ), LDH, ONE, WH, LDWH ) -* -* ==== Copy top of H to bottom of WH ==== -* - CALL ZLACPY( 'ALL', J2, JLEN, H( INCOL+1, JCOL ), LDH, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U21**H ==== -* - CALL ZTRMM( 'L', 'L', 'C', 'N', J2, JLEN, ONE, - $ U( 1, I2+1 ), LDU, WH( I2+1, 1 ), LDWH ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'C', 'N', I4-I2, JLEN, J4-J2, ONE, - $ U( J2+1, I2+1 ), LDU, - $ H( INCOL+1+J2, JCOL ), LDH, ONE, - $ WH( I2+1, 1 ), LDWH ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', KDU, JLEN, WH, LDWH, - $ H( INCOL+1, JCOL ), LDH ) - 180 CONTINUE -* -* ==== Vertical multiply ==== -* - DO 190 JROW = JTOP, MAX( INCOL, KTOP ) - 1, NV - JLEN = MIN( NV, MAX( INCOL, KTOP )-JROW ) -* -* ==== Copy right of H to scratch (the first KZS -* . columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, H( JROW, INCOL+1+J2 ), - $ LDH, WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ H( JROW, INCOL+1 ), LDH, U, LDU, ONE, WV, - $ LDWV ) -* -* ==== Copy left of H to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, H( JROW, INCOL+1 ), LDH, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ H( JROW, INCOL+1+J2 ), LDH, - $ U( J2+1, I2+1 ), LDU, ONE, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Copy it back ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ H( JROW, INCOL+1 ), LDH ) - 190 CONTINUE -* -* ==== Multiply Z (also vertical) ==== -* - IF( WANTZ ) THEN - DO 200 JROW = ILOZ, IHIZ, NV - JLEN = MIN( NV, IHIZ-JROW+1 ) -* -* ==== Copy right of Z to left of scratch (first -* . 
KZS columns get multiplied by zero) ==== -* - CALL ZLACPY( 'ALL', JLEN, KNZ, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ WV( 1, 1+KZS ), LDWV ) -* -* ==== Multiply by U12 ==== -* - CALL ZLASET( 'ALL', JLEN, KZS, ZERO, ZERO, WV, - $ LDWV ) - CALL ZTRMM( 'R', 'U', 'N', 'N', JLEN, KNZ, ONE, - $ U( J2+1, 1+KZS ), LDU, WV( 1, 1+KZS ), - $ LDWV ) -* -* ==== Multiply by U11 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I2, J2, ONE, - $ Z( JROW, INCOL+1 ), LDZ, U, LDU, ONE, - $ WV, LDWV ) -* -* ==== Copy left of Z to right of scratch ==== -* - CALL ZLACPY( 'ALL', JLEN, J2, Z( JROW, INCOL+1 ), - $ LDZ, WV( 1, 1+I2 ), LDWV ) -* -* ==== Multiply by U21 ==== -* - CALL ZTRMM( 'R', 'L', 'N', 'N', JLEN, I4-I2, ONE, - $ U( 1, I2+1 ), LDU, WV( 1, 1+I2 ), - $ LDWV ) -* -* ==== Multiply by U22 ==== -* - CALL ZGEMM( 'N', 'N', JLEN, I4-I2, J4-J2, ONE, - $ Z( JROW, INCOL+1+J2 ), LDZ, - $ U( J2+1, I2+1 ), LDU, ONE, - $ WV( 1, 1+I2 ), LDWV ) -* -* ==== Copy the result back to Z ==== -* - CALL ZLACPY( 'ALL', JLEN, KDU, WV, LDWV, - $ Z( JROW, INCOL+1 ), LDZ ) - 200 CONTINUE - END IF + $ Z( JROW, INCOL+K1 ), LDZ ) + 170 CONTINUE END IF END IF - 210 CONTINUE + 180 CONTINUE * * ==== End of ZLAQR5 ==== * diff --git a/lapack-netlib/SRC/zlarfb_gett.f b/lapack-netlib/SRC/zlarfb_gett.f new file mode 100644 index 000000000..4a3c4dcf1 --- /dev/null +++ b/lapack-netlib/SRC/zlarfb_gett.f @@ -0,0 +1,597 @@ +*> \brief \b ZLARFB_GETT +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZLARFB_GETT + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, +* $ WORK, LDWORK ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* CHARACTER IDENT +* INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), +* $ WORK( LDWORK, * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZLARFB_GETT applies a complex Householder block reflector H from the +*> left to a complex (K+M)-by-N "triangular-pentagonal" matrix +*> composed of two block matrices: an upper trapezoidal K-by-N matrix A +*> stored in the array A, and a rectangular M-by-(N-K) matrix B, stored +*> in the array B. The block reflector H is stored in a compact +*> WY-representation, where the elementary reflectors are in the +*> arrays A, B and T. See Further Details section. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] IDENT +*> \verbatim +*> IDENT is CHARACTER*1 +*> If IDENT = not 'I', or not 'i', then V1 is unit +*> lower-triangular and stored in the left K-by-K block of +*> the input matrix A, +*> If IDENT = 'I' or 'i', then V1 is an identity matrix and +*> not stored. +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix B. +*> M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrices A and B. +*> N >= 0. +*> \endverbatim +*> +*> \param[in] K +*> \verbatim +*> K is INTEGER +*> The number or rows of the matrix A. +*> K is also order of the matrix T, i.e. the number of +*> elementary reflectors whose product defines the block +*> reflector. 0 <= K <= N. 
+*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, dimension (LDT,K) +*> The upper-triangular K-by-K matrix T in the representation +*> of the block reflector. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. LDT >= K. +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> a) In the K-by-N upper-trapezoidal part A: input matrix A. +*> b) In the columns below the diagonal: columns of V1 +*> (ones are not stored on the diagonal). +*> +*> On exit: +*> A is overwritten by rectangular K-by-N product H*A. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array A. LDA >= max(1,K). +*> \endverbatim +*> +*> \param[in,out] B +*> \verbatim +*> B is COMPLEX*16 array, dimension (LDB,N) +*> +*> On entry: +*> a) In the M-by-(N-K) right block: input matrix B. +*> b) In the M-by-N left block: columns of V2. +*> +*> On exit: +*> B is overwritten by rectangular M-by-N product H*B. +*> +*> See Further Details section. +*> \endverbatim +*> +*> \param[in] LDB +*> \verbatim +*> LDB is INTEGER +*> The leading dimension of the array B. LDB >= max(1,M). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> WORK is COMPLEX*16 array, +*> dimension (LDWORK,max(K,N-K)) +*> \endverbatim +*> +*> \param[in] LDWORK +*> \verbatim +*> LDWORK is INTEGER +*> The leading dimension of the array WORK. LDWORK>=max(1,K). +*> +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERauxiliary +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +* +*> \par Further Details: +* ===================== +*> +*> \verbatim +*> +*> (1) Description of the Algebraic Operation. +*> +*> The matrix A is a K-by-N matrix composed of two column block +*> matrices, A1, which is K-by-K, and A2, which is K-by-(N-K): +*> A = ( A1, A2 ). +*> The matrix B is an M-by-N matrix composed of two column block +*> matrices, B1, which is M-by-K, and B2, which is M-by-(N-K): +*> B = ( B1, B2 ). +*> +*> Perform the operation: +*> +*> ( A_out ) := H * ( A_in ) = ( I - V * T * V**H ) * ( A_in ) = +*> ( B_out ) ( B_in ) ( B_in ) +*> = ( I - ( V1 ) * T * ( V1**H, V2**H ) ) * ( A_in ) +*> ( V2 ) ( B_in ) +*> On input: +*> +*> a) ( A_in ) consists of two block columns: +*> ( B_in ) +*> +*> ( A_in ) = (( A1_in ) ( A2_in )) = (( A1_in ) ( A2_in )) +*> ( B_in ) (( B1_in ) ( B2_in )) (( 0 ) ( B2_in )), +*> +*> where the column blocks are: +*> +*> ( A1_in ) is a K-by-K upper-triangular matrix stored in the +*> upper triangular part of the array A(1:K,1:K). +*> ( B1_in ) is an M-by-K rectangular ZERO matrix and not stored. +*> +*> ( A2_in ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_in ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). 
+*> +*> b) V = ( V1 ) +*> ( V2 ) +*> +*> where: +*> 1) if IDENT == 'I',V1 is a K-by-K identity matrix, not stored; +*> 2) if IDENT != 'I',V1 is a K-by-K unit lower-triangular matrix, +*> stored in the lower-triangular part of the array +*> A(1:K,1:K) (ones are not stored), +*> and V2 is an M-by-K rectangular stored the array B(1:M,1:K), +*> (because on input B1_in is a rectangular zero +*> matrix that is not stored and the space is +*> used to store V2). +*> +*> c) T is a K-by-K upper-triangular matrix stored +*> in the array T(1:K,1:K). +*> +*> On output: +*> +*> a) ( A_out ) consists of two block columns: +*> ( B_out ) +*> +*> ( A_out ) = (( A1_out ) ( A2_out )) +*> ( B_out ) (( B1_out ) ( B2_out )), +*> +*> where the column blocks are: +*> +*> ( A1_out ) is a K-by-K square matrix, or a K-by-K +*> upper-triangular matrix, if V1 is an +*> identity matrix. AiOut is stored in +*> the array A(1:K,1:K). +*> ( B1_out ) is an M-by-K rectangular matrix stored +*> in the array B(1:M,K:N). +*> +*> ( A2_out ) is a K-by-(N-K) rectangular matrix stored +*> in the array A(1:K,K+1:N). +*> ( B2_out ) is an M-by-(N-K) rectangular matrix stored +*> in the array B(1:M,K+1:N). +*> +*> +*> The operation above can be represented as the same operation +*> on each block column: +*> +*> ( A1_out ) := H * ( A1_in ) = ( I - V * T * V**H ) * ( A1_in ) +*> ( B1_out ) ( 0 ) ( 0 ) +*> +*> ( A2_out ) := H * ( A2_in ) = ( I - V * T * V**H ) * ( A2_in ) +*> ( B2_out ) ( B2_in ) ( B2_in ) +*> +*> If IDENT != 'I': +*> +*> The computation for column block 1: +*> +*> A1_out: = A1_in - V1*T*(V1**H)*A1_in +*> +*> B1_out: = - V2*T*(V1**H)*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - V1*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( (V1**H)*A2_in + (V2**H)*B2_in ) +*> +*> If IDENT == 'I': +*> +*> The operation for column block 1: +*> +*> A1_out: = A1_in - V1*T*A1_in +*> +*> B1_out: = - V2*T*A1_in +*> +*> The computation for column block 2, which exists if N > K: +*> +*> A2_out: = A2_in - T*( A2_in + (V2**H)*B2_in ) +*> +*> B2_out: = B2_in - V2*T*( A2_in + (V2**H)*B2_in ) +*> +*> (2) Description of the Algorithmic Computation. +*> +*> In the first step, we compute column block 2, i.e. A2 and B2. +*> Here, we need to use the K-by-(N-K) rectangular workspace +*> matrix W2 that is of the same size as the matrix A2. +*> W2 is stored in the array WORK(1:K,1:(N-K)). +*> +*> In the second step, we compute column block 1, i.e. A1 and B1. +*> Here, we need to use the K-by-K square workspace matrix W1 +*> that is of the same size as the as the matrix A1. +*> W1 is stored in the array WORK(1:K,1:K). +*> +*> NOTE: Hence, in this routine, we need the workspace array WORK +*> only of size WORK(1:K,1:max(K,N-K)) so it can hold both W2 from +*> the first step and W1 from the second step. +*> +*> Case (A), when V1 is unit lower-triangular, i.e. IDENT != 'I', +*> more computations than in the Case (B). 
+*> +*> if( IDENT != 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(2) W2: = (V1**H) * W2 = (unit_lower_tr_of_(A1)**H) * W2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(2) W1: = (V1**H) * W1 = (unit_lower_tr_of_(A1)**H) * W1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6) square A1: = A1 - W1 +*> end if +*> end if +*> +*> Case (B), when V1 is an identity matrix, i.e. IDENT == 'I', +*> less computations than in the Case (A) +*> +*> if( IDENT == 'I' ) then +*> if ( N > K ) then +*> (First Step - column block 2) +*> col2_(1) W2: = A2 +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> col2_(7) A2: = A2 - W2 +*> else +*> (Second Step - column block 1) +*> col1_(1) W1: = A1 +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> col1_(6) upper-triangular_of_(A1): = A1 - W1 +*> end if +*> end if +*> +*> Combine these cases (A) and (B) together, this is the resulting +*> algorithm: +*> +*> if ( N > K ) then +*> +*> (First Step - column block 2) +*> +*> col2_(1) W2: = A2 +*> if( IDENT != 'I' ) then +*> col2_(2) W2: = (V1**H) * W2 +*> = (unit_lower_tr_of_(A1)**H) * W2 +*> end if +*> col2_(3) W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2] +*> col2_(4) W2: = T * W2 +*> col2_(5) B2: = B2 - V2 * W2 = B2 - B1 * W2 +*> if( IDENT != 'I' ) then +*> col2_(6) W2: = V1 * W2 = unit_lower_tr_of_(A1) * W2 +*> end if +*> col2_(7) A2: = A2 - W2 +*> +*> else +*> +*> (Second Step - column block 1) +*> +*> col1_(1) W1: = A1 +*> if( IDENT != 'I' ) then +*> col1_(2) W1: = (V1**H) * W1 +*> = (unit_lower_tr_of_(A1)**H) * W1 +*> end if +*> col1_(3) W1: = T * W1 +*> col1_(4) B1: = - V2 * W1 = - B1 * W1 +*> if( IDENT != 'I' ) then +*> col1_(5) square W1: = V1 * W1 = unit_lower_tr_of_(A1) * W1 +*> col1_(6_a) below_diag_of_(A1): = - below_diag_of_(W1) +*> end if +*> col1_(6_b) up_tr_of_(A1): = up_tr_of_(A1) - up_tr_of_(W1) +*> +*> end if +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZLARFB_GETT( IDENT, M, N, K, T, LDT, A, LDA, B, LDB, + $ WORK, LDWORK ) + IMPLICIT NONE +* +* -- LAPACK auxiliary routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + CHARACTER IDENT + INTEGER K, LDA, LDB, LDT, LDWORK, M, N +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), B( LDB, * ), T( LDT, * ), + $ WORK( LDWORK, * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LNOTIDENT + INTEGER I, J +* .. +* .. EXTERNAL FUNCTIONS .. + LOGICAL LSAME + EXTERNAL LSAME +* .. +* .. External Subroutines .. + EXTERNAL ZCOPY, ZGEMM, ZTRMM +* .. +* .. Executable Statements .. +* +* Quick return if possible +* + IF( M.LT.0 .OR. N.LE.0 .OR. K.EQ.0 .OR. 
K.GT.N ) + $ RETURN +* + LNOTIDENT = .NOT.LSAME( IDENT, 'I' ) +* +* ------------------------------------------------------------------ +* +* First Step. Computation of the Column Block 2: +* +* ( A2 ) := H * ( A2 ) +* ( B2 ) ( B2 ) +* +* ------------------------------------------------------------------ +* + IF( N.GT.K ) THEN +* +* col2_(1) Compute W2: = A2. Therefore, copy A2 = A(1:K, K+1:N) +* into W2=WORK(1:K, 1:N-K) column-by-column. +* + DO J = 1, N-K + CALL ZCOPY( K, A( 1, K+J ), 1, WORK( 1, J ), 1 ) + END DO + + IF( LNOTIDENT ) THEN +* +* col2_(2) Compute W2: = (V1**H) * W2 = (A1**H) * W2, +* V1 is not an identy matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored). +* +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(3) Compute W2: = W2 + (V2**H) * B2 = W2 + (B1**H) * B2 +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'C', 'N', K, N-K, M, CONE, B, LDB, + $ B( 1, K+1 ), LDB, CONE, WORK, LDWORK ) + END IF +* +* col2_(4) Compute W2: = T * W2, +* T is upper-triangular. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, N-K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col2_(5) Compute B2: = B2 - V2 * W2 = B2 - B1 * W2, +* V2 stored in B1. +* + IF( M.GT.0 ) THEN + CALL ZGEMM( 'N', 'N', M, N-K, K, -CONE, B, LDB, + $ WORK, LDWORK, CONE, B( 1, K+1 ), LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col2_(6) Compute W2: = V1 * W2 = A1 * W2, +* V1 is not an identity matrix, but unit lower-triangular, +* V1 stored in A1 (diagonal ones are not stored). +* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, N-K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col2_(7) Compute A2: = A2 - W2 = +* = A(1:K, K+1:N-K) - WORK(1:K, 1:N-K), +* column-by-column. +* + DO J = 1, N-K + DO I = 1, K + A( I, K+J ) = A( I, K+J ) - WORK( I, J ) + END DO + END DO +* + END IF +* +* ------------------------------------------------------------------ +* +* Second Step. Computation of the Column Block 1: +* +* ( A1 ) := H * ( A1 ) +* ( B1 ) ( 0 ) +* +* ------------------------------------------------------------------ +* +* col1_(1) Compute W1: = A1. Copy the upper-triangular +* A1 = A(1:K, 1:K) into the upper-triangular +* W1 = WORK(1:K, 1:K) column-by-column. +* + DO J = 1, K + CALL ZCOPY( J, A( 1, J ), 1, WORK( 1, J ), 1 ) + END DO +* +* Set the subdiagonal elements of W1 to zero column-by-column. +* + DO J = 1, K - 1 + DO I = J + 1, K + WORK( I, J ) = CZERO + END DO + END DO +* + IF( LNOTIDENT ) THEN +* +* col1_(2) Compute W1: = (V1**H) * W1 = (A1**H) * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'L', 'C', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) + END IF +* +* col1_(3) Compute W1: = T * W1, +* T is upper-triangular, +* W1 is upper-triangular with zeroes below the diagonal. +* + CALL ZTRMM( 'L', 'U', 'N', 'N', K, K, CONE, T, LDT, + $ WORK, LDWORK ) +* +* col1_(4) Compute B1: = - V2 * W1 = - B1 * W1, +* V2 = B1, W1 is upper-triangular with zeroes below the diagonal. +* + IF( M.GT.0 ) THEN + CALL ZTRMM( 'R', 'U', 'N', 'N', M, K, -CONE, WORK, LDWORK, + $ B, LDB ) + END IF +* + IF( LNOTIDENT ) THEN +* +* col1_(5) Compute W1: = V1 * W1 = A1 * W1, +* V1 is not an identity matrix, but unit lower-triangular +* V1 stored in A1 (diagonal ones are not stored), +* W1 is upper-triangular on input with zeroes below the diagonal, +* and square on output. 
+* + CALL ZTRMM( 'L', 'L', 'N', 'U', K, K, CONE, A, LDA, + $ WORK, LDWORK ) +* +* col1_(6) Compute A1: = A1 - W1 = A(1:K, 1:K) - WORK(1:K, 1:K) +* column-by-column. A1 is upper-triangular on input. +* If IDENT, A1 is square on output, and W1 is square, +* if NOT IDENT, A1 is upper-triangular on output, +* W1 is upper-triangular. +* +* col1_(6)_a Compute elements of A1 below the diagonal. +* + DO J = 1, K - 1 + DO I = J + 1, K + A( I, J ) = - WORK( I, J ) + END DO + END DO +* + END IF +* +* col1_(6)_b Compute elements of A1 on and above the diagonal. +* + DO J = 1, K + DO I = 1, J + A( I, J ) = A( I, J ) - WORK( I, J ) + END DO + END DO +* + RETURN +* +* End of ZLARFB_GETT +* + END diff --git a/lapack-netlib/SRC/ztgsja.f b/lapack-netlib/SRC/ztgsja.f index 851f6504a..c80e33158 100644 --- a/lapack-netlib/SRC/ztgsja.f +++ b/lapack-netlib/SRC/ztgsja.f @@ -401,7 +401,7 @@ * .. Parameters .. INTEGER MAXIT PARAMETER ( MAXIT = 40 ) - DOUBLE PRECISION ZERO, ONE + DOUBLE PRECISION ZERO, ONE, HUGENUM PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) COMPLEX*16 CZERO, CONE PARAMETER ( CZERO = ( 0.0D+0, 0.0D+0 ), @@ -424,7 +424,8 @@ $ ZLASET, ZROT * .. * .. Intrinsic Functions .. - INTRINSIC ABS, DBLE, DCONJG, MAX, MIN + INTRINSIC ABS, DBLE, DCONJG, MAX, MIN, HUGE + PARAMETER ( HUGENUM = HUGE(ZERO) ) * .. * .. Executable Statements .. * @@ -610,9 +611,9 @@ * A1 = DBLE( A( K+I, N-L+I ) ) B1 = DBLE( B( I, N-L+I ) ) + GAMMA = B1 / A1 * - IF( A1.NE.ZERO ) THEN - GAMMA = B1 / A1 + IF( (GAMMA.LE.HUGENUM).AND.(GAMMA.GE.-HUGENUM) ) THEN * IF( GAMMA.LT.ZERO ) THEN CALL ZDSCAL( L-I+1, -ONE, B( I, N-L+I ), LDB ) diff --git a/lapack-netlib/SRC/zungbr.f b/lapack-netlib/SRC/zungbr.f index 3cdb8127d..c1c35822c 100644 --- a/lapack-netlib/SRC/zungbr.f +++ b/lapack-netlib/SRC/zungbr.f @@ -222,8 +222,8 @@ CALL ZUNGQR( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( M.GT.1 ) THEN - CALL ZUNGQR( M-1, M-1, M-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGQR( M-1, M-1, M-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF ELSE @@ -231,8 +231,8 @@ CALL ZUNGLQ( M, N, K, A, LDA, TAU, WORK, -1, IINFO ) ELSE IF( N.GT.1 ) THEN - CALL ZUNGLQ( N-1, N-1, N-1, A( 2, 2 ), LDA, TAU, WORK, - $ -1, IINFO ) + CALL ZUNGLQ( N-1, N-1, N-1, A, LDA, TAU, WORK, -1, + $ IINFO ) END IF END IF END IF diff --git a/lapack-netlib/SRC/zungtsqr_row.f b/lapack-netlib/SRC/zungtsqr_row.f new file mode 100644 index 000000000..0d32ad6ce --- /dev/null +++ b/lapack-netlib/SRC/zungtsqr_row.f @@ -0,0 +1,380 @@ +*> \brief \b ZUNGTSQR_ROW +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +*> \htmlonly +*> Download ZUNGTSQR_ROW + dependencies +*> +*> [TGZ] +*> +*> [ZIP] +*> +*> [TXT] +*> \endhtmlonly +* +* Definition: +* =========== +* +* SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, +* $ LWORK, INFO ) +* IMPLICIT NONE +* +* .. Scalar Arguments .. +* INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. +* COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNGTSQR_ROW generates an M-by-N complex matrix Q_out with +*> orthonormal columns from the output of ZLATSQR. These N orthonormal +*> columns are the first N columns of a product of complex unitary +*> matrices Q(k)_in of order M, which are returned by ZLATSQR in +*> a special format. +*> +*> Q_out = first_N_columns_of( Q(1)_in * Q(2)_in * ... * Q(k)_in ). 
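Stepping back to the ZTGSJA hunk above: the division guard changed so that GAMMA = B1/A1 is computed unconditionally and accepted only if it lies in [-HUGE, HUGE]. Unlike the old A1.NE.ZERO test, this also rejects Inf and NaN quotients produced by overflow. A minimal sketch of the pattern (program name and sample values are illustrative; IEEE non-stop arithmetic is assumed):

      PROGRAM FINGRD
*     Compute the quotient first, then test it for finiteness against
*     +/-HUGE, as in the rewritten ZTGSJA branch.  Under IEEE
*     arithmetic 1/0 gives Inf, the combined test fails, and the
*     fallback branch is taken instead of a spurious scaling.
      DOUBLE PRECISION A1, B1, GAMMA, HUGENUM
      PARAMETER ( HUGENUM = HUGE( 0.0D+0 ) )
      A1 = 0.0D+0
      B1 = 1.0D+0
      GAMMA = B1 / A1
      IF( ( GAMMA.LE.HUGENUM ) .AND. ( GAMMA.GE.-HUGENUM ) ) THEN
         WRITE( *, * ) 'finite quotient:', GAMMA
      ELSE
         WRITE( *, * ) 'quotient not finite, take fallback branch'
      END IF
      END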
+*> +*> The input matrices Q(k)_in are stored in row and column blocks in A. +*> See the documentation of ZLATSQR for more details on the format of +*> Q(k)_in, where each Q(k)_in is represented by block Householder +*> transformations. This routine calls an auxiliary routine ZLARFB_GETT, +*> where the computation is performed on each individual block. The +*> algorithm first sweeps NB-sized column blocks from the right to left +*> starting in the bottom row block and continues to the top row block +*> (hence _ROW in the routine name). This sweep is in reverse order of +*> the order in which ZLATSQR generates the output blocks. +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> The number of rows of the matrix A. M >= 0. +*> \endverbatim +*> +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> The number of columns of the matrix A. M >= N >= 0. +*> \endverbatim +*> +*> \param[in] MB +*> \verbatim +*> MB is INTEGER +*> The row block size used by ZLATSQR to return +*> arrays A and T. MB > N. +*> (Note that if MB > M, then M is used instead of MB +*> as the row block size). +*> \endverbatim +*> +*> \param[in] NB +*> \verbatim +*> NB is INTEGER +*> The column block size used by ZLATSQR to return +*> arrays A and T. NB >= 1. +*> (Note that if NB > N, then N is used instead of NB +*> as the column block size). +*> \endverbatim +*> +*> \param[in,out] A +*> \verbatim +*> A is COMPLEX*16 array, dimension (LDA,N) +*> +*> On entry: +*> +*> The elements on and above the diagonal are not used as +*> input. The elements below the diagonal represent the unit +*> lower-trapezoidal blocked matrix V computed by ZLATSQR +*> that defines the input matrices Q_in(k) (ones on the +*> diagonal are not stored). See ZLATSQR for more details. +*> +*> On exit: +*> +*> The array A contains an M-by-N orthonormal matrix Q_out, +*> i.e the columns of A are orthogonal unit vectors. +*> \endverbatim +*> +*> \param[in] LDA +*> \verbatim +*> LDA is INTEGER +*> The leading dimension of the array A. LDA >= max(1,M). +*> \endverbatim +*> +*> \param[in] T +*> \verbatim +*> T is COMPLEX*16 array, +*> dimension (LDT, N * NIRB) +*> where NIRB = Number_of_input_row_blocks +*> = MAX( 1, CEIL((M-N)/(MB-N)) ) +*> Let NICB = Number_of_input_col_blocks +*> = CEIL(N/NB) +*> +*> The upper-triangular block reflectors used to define the +*> input matrices Q_in(k), k=(1:NIRB*NICB). The block +*> reflectors are stored in compact form in NIRB block +*> reflector sequences. Each of the NIRB block reflector +*> sequences is stored in a larger NB-by-N column block of T +*> and consists of NICB smaller NB-by-NB upper-triangular +*> column blocks. See ZLATSQR for more details on the format +*> of T. +*> \endverbatim +*> +*> \param[in] LDT +*> \verbatim +*> LDT is INTEGER +*> The leading dimension of the array T. +*> LDT >= max(1,min(NB,N)). +*> \endverbatim +*> +*> \param[out] WORK +*> \verbatim +*> (workspace) COMPLEX*16 array, dimension (MAX(1,LWORK)) +*> On exit, if INFO = 0, WORK(1) returns the optimal LWORK. +*> \endverbatim +*> +*> \param[in] LWORK +*> \verbatim +*> The dimension of the array WORK. +*> LWORK >= NBLOCAL * MAX(NBLOCAL,(N-NBLOCAL)), +*> where NBLOCAL=MIN(NB,N). +*> If LWORK = -1, then a workspace query is assumed. +*> The routine only calculates the optimal size of the WORK +*> array, returns this value as the first entry of the WORK +*> array, and no error message related to LWORK is issued +*> by XERBLA. 
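The LWORK = -1 query convention documented above composes naturally with ZLATSQR. A hedged end-to-end sketch (driver name, sizes and the simple test matrix are illustrative only, and error handling is minimal): factor a tall-skinny A, then form the explicit orthonormal factor with the new routine.

      PROGRAM TSQDRV
*     Illustrative driver: ZLATSQR followed by ZUNGTSQR_ROW with the
*     same MB/NB and the T produced by the factorization.  T needs
*     N*NIRB columns, NIRB = MAX( 1, CEIL((M-N)/(MB-N)) ) = 5 here.
      INTEGER M, N, MB, NB, LDA, LDT, LWMAX, LWORK, INFO, I, J
      PARAMETER ( M = 12, N = 3, MB = 5, NB = 2,
     $            LDA = M, LDT = NB, LWMAX = 1024 )
      COMPLEX*16 A( LDA, N ), T( LDT, 5*N ), WORK( LWMAX )
      EXTERNAL ZLATSQR, ZUNGTSQR_ROW
      INTRINSIC DCMPLX, DBLE, INT
*     Build a simple full-rank test matrix.
      DO J = 1, N
         DO I = 1, M
            A( I, J ) = DCMPLX( DBLE( I+J ), DBLE( I-J ) )
         END DO
         A( J, J ) = A( J, J ) + DCMPLX( DBLE( M ), 0.0D+0 )
      END DO
      CALL ZLATSQR( M, N, MB, NB, A, LDA, T, LDT, WORK, LWMAX, INFO )
      IF( INFO.NE.0 ) STOP 'ZLATSQR failed'
*     Workspace query, then the explicit build of Q_out.
      CALL ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, -1,
     $                   INFO )
      LWORK = INT( DBLE( WORK( 1 ) ) )
      IF( LWORK.GT.LWMAX ) STOP 'increase LWMAX'
      CALL ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, LWORK,
     $                   INFO )
      IF( INFO.NE.0 ) STOP 'ZUNGTSQR_ROW failed'
      WRITE( *, * ) 'Q(1,1) =', A( 1, 1 )
      END

On exit A holds the M-by-N matrix Q_out whose columns are orthonormal.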
+*> \endverbatim +*> +*> \param[out] INFO +*> \verbatim +*> INFO is INTEGER +*> = 0: successful exit +*> < 0: if INFO = -i, the i-th argument had an illegal value +*> \endverbatim +*> +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16OTHERcomputational +* +*> \par Contributors: +* ================== +*> +*> \verbatim +*> +*> November 2020, Igor Kozachenko, +*> Computer Science Division, +*> University of California, Berkeley +*> +*> \endverbatim +*> +* ===================================================================== + SUBROUTINE ZUNGTSQR_ROW( M, N, MB, NB, A, LDA, T, LDT, WORK, + $ LWORK, INFO ) + IMPLICIT NONE +* +* -- LAPACK computational routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER INFO, LDA, LDT, LWORK, M, N, MB, NB +* .. +* .. Array Arguments .. + COMPLEX*16 A( LDA, * ), T( LDT, * ), WORK( * ) +* .. +* +* ===================================================================== +* +* .. Parameters .. + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL LQUERY + INTEGER NBLOCAL, MB2, M_PLUS_ONE, ITMP, IB_BOTTOM, + $ LWORKOPT, NUM_ALL_ROW_BLOCKS, JB_T, IB, IMB, + $ KB, KB_LAST, KNB, MB1 +* .. +* .. Local Arrays .. + COMPLEX*16 DUMMY( 1, 1 ) +* .. +* .. External Subroutines .. + EXTERNAL ZLARFB_GETT, ZLASET, XERBLA +* .. +* .. Intrinsic Functions .. + INTRINSIC DCMPLX, MAX, MIN +* .. +* .. Executable Statements .. +* +* Test the input parameters +* + INFO = 0 + LQUERY = LWORK.EQ.-1 + IF( M.LT.0 ) THEN + INFO = -1 + ELSE IF( N.LT.0 .OR. M.LT.N ) THEN + INFO = -2 + ELSE IF( MB.LE.N ) THEN + INFO = -3 + ELSE IF( NB.LT.1 ) THEN + INFO = -4 + ELSE IF( LDA.LT.MAX( 1, M ) ) THEN + INFO = -6 + ELSE IF( LDT.LT.MAX( 1, MIN( NB, N ) ) ) THEN + INFO = -8 + ELSE IF( LWORK.LT.1 .AND. .NOT.LQUERY ) THEN + INFO = -10 + END IF +* + NBLOCAL = MIN( NB, N ) +* +* Determine the workspace size. +* + IF( INFO.EQ.0 ) THEN + LWORKOPT = NBLOCAL * MAX( NBLOCAL, ( N - NBLOCAL ) ) + END IF +* +* Handle error in the input parameters and handle the workspace query. +* + IF( INFO.NE.0 ) THEN + CALL XERBLA( 'ZUNGTSQR_ROW', -INFO ) + RETURN + ELSE IF ( LQUERY ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* Quick return if possible +* + IF( MIN( M, N ).EQ.0 ) THEN + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN + END IF +* +* (0) Set the upper-triangular part of the matrix A to zero and +* its diagonal elements to one. +* + CALL ZLASET('U', M, N, CZERO, CONE, A, LDA ) +* +* KB_LAST is the column index of the last column block reflector +* in the matrices T and V. +* + KB_LAST = ( ( N-1 ) / NBLOCAL ) * NBLOCAL + 1 +* +* +* (1) Bottom-up loop over row blocks of A, except the top row block. +* NOTE: If MB>=M, then the loop is never executed. +* + IF ( MB.LT.M ) THEN +* +* MB2 is the row blocking size for the row blocks before the +* first top row block in the matrix A. IB is the row index for +* the row blocks in the matrix A before the first top row block. +* IB_BOTTOM is the row index for the last bottom row block +* in the matrix A. JB_T is the column index of the corresponding +* column block in the matrix T. +* +* Initialize variables. +* +* NUM_ALL_ROW_BLOCKS is the number of row blocks in the matrix A +* including the first row block. 
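*
*     A worked instance of this bookkeeping (sample sizes are
*     illustrative, not from the patch): for M = 10, N = 2, MB = 5
*     the formulas below give MB2 = 3, ITMP = ( 10-5-1 ) / 3 = 1,
*     IB_BOTTOM = 9, NUM_ALL_ROW_BLOCKS = 3 and an initial JB_T = 7,
*     so the bottom-up loop visits the row blocks starting at rows 9
*     and 6 before the top MB-row block is handled in step (2).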
+* + MB2 = MB - N + M_PLUS_ONE = M + 1 + ITMP = ( M - MB - 1 ) / MB2 + IB_BOTTOM = ITMP * MB2 + MB + 1 + NUM_ALL_ROW_BLOCKS = ITMP + 2 + JB_T = NUM_ALL_ROW_BLOCKS * N + 1 +* + DO IB = IB_BOTTOM, MB+1, -MB2 +* +* Determine the block size IMB for the current row block +* in the matrix A. +* + IMB = MIN( M_PLUS_ONE - IB, MB2 ) +* +* Determine the column index JB_T for the current column block +* in the matrix T. +* + JB_T = JB_T - N +* +* Apply column blocks of H in the row block from right to left. +* +* KB is the column index of the current column block reflector +* in the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + CALL ZLARFB_GETT( 'I', IMB, N-KB+1, KNB, + $ T( 1, JB_T+KB-1 ), LDT, A( KB, KB ), LDA, + $ A( IB, KB ), LDA, WORK, KNB ) +* + END DO +* + END DO +* + END IF +* +* (2) Top row block of A. +* NOTE: If MB>=M, then we have only one row block of A of size M +* and we work on the entire matrix A. +* + MB1 = MIN( MB, M ) +* +* Apply column blocks of H in the top row block from right to left. +* +* KB is the column index of the current block reflector in +* the matrices T and V. +* + DO KB = KB_LAST, 1, -NBLOCAL +* +* Determine the size of the current column block KNB in +* the matrices T and V. +* + KNB = MIN( NBLOCAL, N - KB + 1 ) +* + IF( MB1-KB-KNB+1.EQ.0 ) THEN +* +* In SLARFB_GETT parameters, when M=0, then the matrix B +* does not exist, hence we need to pass a dummy array +* reference DUMMY(1,1) to B with LDDUMMY=1. +* + CALL ZLARFB_GETT( 'N', 0, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ DUMMY( 1, 1 ), 1, WORK, KNB ) + ELSE + CALL ZLARFB_GETT( 'N', MB1-KB-KNB+1, N-KB+1, KNB, + $ T( 1, KB ), LDT, A( KB, KB ), LDA, + $ A( KB+KNB, KB), LDA, WORK, KNB ) + + END IF +* + END DO +* + WORK( 1 ) = DCMPLX( LWORKOPT ) + RETURN +* +* End of ZUNGTSQR_ROW +* + END diff --git a/lapack-netlib/TESTING/EIG/cchkee.F b/lapack-netlib/TESTING/EIG/cchkee.F index 0d3d7493c..de4aed696 100644 --- a/lapack-netlib/TESTING/EIG/cchkee.F +++ b/lapack-netlib/TESTING/EIG/cchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CST', NOUT ) @@ -2338,7 +2338,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL CERRST( 'CHB', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/dchkee.F b/lapack-netlib/TESTING/EIG/dchkee.F index ee22ce33d..00e8eb57f 100644 --- a/lapack-netlib/TESTING/EIG/dchkee.F +++ b/lapack-netlib/TESTING/EIG/dchkee.F @@ -1876,7 +1876,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL DERRST( 'DST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/schkee.F b/lapack-netlib/TESTING/EIG/schkee.F index a063c18b5..c3f9ca162 100644 --- a/lapack-netlib/TESTING/EIG/schkee.F +++ b/lapack-netlib/TESTING/EIG/schkee.F @@ -1877,7 +1877,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL SERRST( 'SST', NOUT ) diff --git a/lapack-netlib/TESTING/EIG/zchkee.F b/lapack-netlib/TESTING/EIG/zchkee.F index 29604956d..908b7d651 
100644 --- a/lapack-netlib/TESTING/EIG/zchkee.F +++ b/lapack-netlib/TESTING/EIG/zchkee.F @@ -1871,7 +1871,7 @@ CALL XLAENV( 9, 25 ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZST', NOUT ) @@ -2336,7 +2336,7 @@ CALL ALAREQ( C3, NTYPES, DOTYPE, MAXTYP, NIN, NOUT ) IF( TSTERR ) THEN #if defined(_OPENMP) - N_THREADS = OMP_GET_NUM_THREADS() + N_THREADS = OMP_GET_MAX_THREADS() CALL OMP_SET_NUM_THREADS(1) #endif CALL ZERRST( 'ZHB', NOUT ) diff --git a/lapack-netlib/TESTING/LIN/CMakeLists.txt b/lapack-netlib/TESTING/LIN/CMakeLists.txt index 0d0bb5418..309ed7e77 100644 --- a/lapack-netlib/TESTING/LIN/CMakeLists.txt +++ b/lapack-netlib/TESTING/LIN/CMakeLists.txt @@ -40,7 +40,7 @@ set(SLINTST schkaa.f sgennd.f sqrt04.f sqrt05.f schkqrt.f serrqrt.f schkqrtp.f serrqrtp.f schklqt.f schklqtp.f schktsqr.f serrlqt.f serrlqtp.f serrtsqr.f stsqr01.f slqt04.f slqt05.f - schkorhr_col.f serrorhr_col.f sorhr_col01.f) + schkorhr_col.f serrorhr_col.f sorhr_col01.f sorhr_col02.f) if(USE_XBLAS) list(APPEND SLINTST sdrvgbx.f sdrvgex.f sdrvsyx.f sdrvpox.f @@ -96,7 +96,7 @@ set(CLINTST cchkaa.f cqrt04.f cqrt05.f cchkqrt.f cerrqrt.f cchkqrtp.f cerrqrtp.f cchklqt.f cchklqtp.f cchktsqr.f cerrlqt.f cerrlqtp.f cerrtsqr.f ctsqr01.f clqt04.f clqt05.f - cchkunhr_col.f cerrunhr_col.f cunhr_col01.f) + cchkunhr_col.f cerrunhr_col.f cunhr_col01.f cunhr_col02.f) if(USE_XBLAS) list(APPEND CLINTST cdrvgbx.f cdrvgex.f cdrvhex.f cdrvsyx.f cdrvpox.f @@ -142,7 +142,7 @@ set(DLINTST dchkaa.f dqrt04.f dqrt05.f dchkqrt.f derrqrt.f dchkqrtp.f derrqrtp.f dchklq.f dchklqt.f dchklqtp.f dchktsqr.f derrlqt.f derrlqtp.f derrtsqr.f dtsqr01.f dlqt04.f dlqt05.f - dchkorhr_col.f derrorhr_col.f dorhr_col01.f) + dchkorhr_col.f derrorhr_col.f dorhr_col01.f dorhr_col02.f) if(USE_XBLAS) list(APPEND DLINTST ddrvgbx.f ddrvgex.f ddrvsyx.f ddrvpox.f @@ -198,7 +198,7 @@ set(ZLINTST zchkaa.f zqrt04.f zqrt05.f zchkqrt.f zerrqrt.f zchkqrtp.f zerrqrtp.f zchklqt.f zchklqtp.f zchktsqr.f zerrlqt.f zerrlqtp.f zerrtsqr.f ztsqr01.f zlqt04.f zlqt05.f - zchkunhr_col.f zerrunhr_col.f zunhr_col01.f) + zchkunhr_col.f zerrunhr_col.f zunhr_col01.f zunhr_col02.f) if(USE_XBLAS) list(APPEND ZLINTST zdrvgbx.f zdrvgex.f zdrvhex.f zdrvsyx.f zdrvpox.f diff --git a/lapack-netlib/TESTING/LIN/Makefile b/lapack-netlib/TESTING/LIN/Makefile index 6e790aa93..674265816 100644 --- a/lapack-netlib/TESTING/LIN/Makefile +++ b/lapack-netlib/TESTING/LIN/Makefile @@ -74,7 +74,7 @@ SLINTST = schkaa.o \ sgennd.o sqrt04.o sqrt05.o schkqrt.o serrqrt.o schkqrtp.o serrqrtp.o \ schklqt.o schklqtp.o schktsqr.o \ serrlqt.o serrlqtp.o serrtsqr.o stsqr01.o slqt04.o slqt05.o \ - schkorhr_col.o serrorhr_col.o sorhr_col01.o + schkorhr_col.o serrorhr_col.o sorhr_col01.o sorhr_col02.o ifdef USEXBLAS SLINTST += sdrvgbx.o sdrvgex.o sdrvsyx.o sdrvpox.o \ @@ -123,7 +123,7 @@ CLINTST = cchkaa.o \ cqrt04.o cqrt05.o cchkqrt.o cerrqrt.o cchkqrtp.o cerrqrtp.o \ cchklqt.o cchklqtp.o cchktsqr.o \ cerrlqt.o cerrlqtp.o cerrtsqr.o ctsqr01.o clqt04.o clqt05.o \ - cchkunhr_col.o cerrunhr_col.o cunhr_col01.o + cchkunhr_col.o cerrunhr_col.o cunhr_col01.o cunhr_col02.o ifdef USEXBLAS CLINTST += cdrvgbx.o cdrvgex.o cdrvhex.o cdrvsyx.o cdrvpox.o \ @@ -167,7 +167,7 @@ DLINTST = dchkaa.o \ dqrt04.o dqrt05.o dchkqrt.o derrqrt.o dchkqrtp.o derrqrtp.o \ dchklq.o dchklqt.o dchklqtp.o dchktsqr.o \ derrlqt.o derrlqtp.o derrtsqr.o dtsqr01.o dlqt04.o dlqt05.o \ - dchkorhr_col.o derrorhr_col.o dorhr_col01.o + dchkorhr_col.o 
derrorhr_col.o dorhr_col01.o dorhr_col02.o ifdef USEXBLAS DLINTST += ddrvgbx.o ddrvgex.o ddrvsyx.o ddrvpox.o \ @@ -215,7 +215,7 @@ ZLINTST = zchkaa.o \ zqrt04.o zqrt05.o zchkqrt.o zerrqrt.o zchkqrtp.o zerrqrtp.o \ zchklqt.o zchklqtp.o zchktsqr.o \ zerrlqt.o zerrlqtp.o zerrtsqr.o ztsqr01.o zlqt04.o zlqt05.o \ - zchkunhr_col.o zerrunhr_col.o zunhr_col01.o + zchkunhr_col.o zerrunhr_col.o zunhr_col01.o zunhr_col02.o ifdef USEXBLAS ZLINTST += zdrvgbx.o zdrvgex.o zdrvhex.o zdrvsyx.o zdrvpox.o \ diff --git a/lapack-netlib/TESTING/LIN/cchkunhr_col.f b/lapack-netlib/TESTING/LIN/cchkunhr_col.f index 00077ddd9..0d6a9063d 100644 --- a/lapack-netlib/TESTING/LIN/cchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/cchkunhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> CCHKUNHR_COL tests CUNHR_COL using CLATSQR and CGEMQRT. Therefore, CLATSQR -*> (used in CGEQR) and CGEMQRT (used in CGEMQR) have to be tested -*> before this test. +*> CCHKUNHR_COL tests: +*> 1) CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT, +*> 2) CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex_lin * * ===================================================================== - SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE CCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01 + EXTERNAL ALAHD, ALASUM, CERRUNHR_COL, CUNHR_COL01, + $ CUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test CUNHR_COL * - CALL CUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL CUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test CUNHR_COL +* + CALL CUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. 
NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'CUNGTSQR and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'CUNGTSQR_ROW and CUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of CCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/cdrvgex.f b/lapack-netlib/TESTING/LIN/cdrvgex.f index 51fc84899..9b075908f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvgex.f +++ b/lapack-netlib/TESTING/LIN/cdrvgex.f @@ -707,9 +707,10 @@ CALL CLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL CLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL CLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL CLASET( 'Full', N, N, CMPLX( ZERO ), + $ CMPLX( ZERO ), AFAC, LDA ) + CALL CLASET( 'Full', N, NRHS, CMPLX( ZERO ), + $ CMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f index 32be41f64..959258e1f 100644 --- a/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f +++ b/lapack-netlib/TESTING/LIN/cdrvhe_aa_2stage.f @@ -449,11 +449,11 @@ * Reconstruct matrix from factors and compute * residual. * -c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, -c $ IWORK, AINV, LDA, RWORK, -c $ RESULT( 2 ) ) -c NT = 2 - NT = 1 +c CALL CHET01_AA( UPLO, N, A, LDA, AFAC, LDA, +c $ IWORK, AINV, LDA, RWORK, +c $ RESULT( 2 ) ) +c NT = 2 + NT = 1 * * Print information about the tests that did not pass * the threshold. diff --git a/lapack-netlib/TESTING/LIN/cdrvrfp.f b/lapack-netlib/TESTING/LIN/cdrvrfp.f index a57688f83..362a0e7cb 100644 --- a/lapack-netlib/TESTING/LIN/cdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/cdrvrfp.f @@ -449,19 +449,19 @@ * Form the inverse of A. * CALL CPOTRI( UPLO, N, A, LDA, INFO ) + + IF ( N .NE. 0 ) THEN * -* Compute the 1-norm condition number of A. +* Compute the 1-norm condition number of A. * - IF ( N .NE. 0 ) THEN AINVNM = CLANHE( '1', UPLO, N, A, LDA, + S_WORK_CLANHE ) RCONDC = ( ONE / ANORM ) / AINVNM * * Restore the matrix A. * - CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) + CALL CLACPY( UPLO, N, N, ASAV, LDA, A, LDA ) END IF - * END IF * diff --git a/lapack-netlib/TESTING/LIN/cunhr_col01.f b/lapack-netlib/TESTING/LIN/cunhr_col01.f index d760caba5..d77d60b1a 100644 --- a/lapack-netlib/TESTING/LIN/cunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/cunhr_col01.f @@ -13,7 +13,7 @@ * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* DOUBLE PRECISION RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> CUNHR_COL01 tests CUNHR_COL using CLATSQR, CGEMQRT and CUNGTSQR. -*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part CGEMQR), CUNGTSQR +*> CUNHR_COL01 tests CUNGTSQR and CUNHR_COL using CLATSQR, CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. 
-*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup complex16_lin +*> \ingroup complex_lin * * ===================================================================== SUBROUTINE CUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,10 +131,10 @@ * * .. * .. Local allocatable arrays - COMPLEX, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) - REAL, ALLOCATABLE :: RWORK(:) + REAL , ALLOCATABLE :: RWORK(:) * * .. Parameters .. REAL ZERO @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, AF, M, R, M ) + CALL CLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'CLACPY' - CALL CLACPY( 'U', M, N, R, M, AF, M ) + CALL CLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/cunhr_col02.f b/lapack-netlib/TESTING/LIN/cunhr_col02.f new file mode 100644 index 000000000..001f291da --- /dev/null +++ b/lapack-netlib/TESTING/LIN/cunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b CUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> CUNHR_COL02 tests CUNGTSQR_ROW and CUNHR_COL inside CGETSQRHRT +*> (which calls CLATSQR, CUNGTSQR_ROW and CUNHR_COL) using CGEMQRT. +*> Therefore, CLATSQR (part of CGEQR), CGEMQRT (part of CGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in CGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using CGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using CGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex_lin +* +* ===================================================================== + SUBROUTINE CUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + REAL , ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + REAL ZERO + PARAMETER ( ZERO = 0.0E+0 ) + COMPLEX CONE, CZERO + PARAMETER ( CONE = ( 1.0E+0, 0.0E+0 ), + $ CZERO = ( 0.0E+0, 0.0E+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX WORKQUERY( 1 ) +* .. +* .. External Functions .. 
+ REAL SLAMCH, CLANGE, CLANSY + EXTERNAL SLAMCH, CLANGE, CLANSY +* .. +* .. External Subroutines .. + EXTERNAL CLACPY, CLARNV, CLASET, CGETSQRHRT, + $ CSCAL, CGEMM, CGEMQRT, CHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL CLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL CLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in CLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* CGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In CGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'CGETSQRHRT' + CALL CGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
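The residual checks that follow validate the CGETSQRHRT factorization against an explicitly formed Q. A minimal NumPy sketch of TEST 1 and TEST 2 below, with numpy.linalg.qr as a stand-in for CGETSQRHRT plus CGEMQRT; it illustrates the residual scaling only, not the LAPACK driver itself:

    import numpy as np

    rng = np.random.default_rng(1988)
    m, n = 12, 5
    eps = np.finfo(np.complex64).eps          # single-precision machine epsilon (cf. SLAMCH)

    A = (rng.standard_normal((m, n))
         + 1j * rng.standard_normal((m, n))).astype(np.complex64)

    # Explicit m-by-m Q (Q = Q_qr * I in the Fortran test) and the m-by-n R.
    Q, R = np.linalg.qr(A, mode="complete")

    def one_norm(X):                          # the '1' norm used by CLANGE
        return np.abs(X).sum(axis=0).max()

    # TEST 1: |R - (Q**H)*A| / ( eps * m * |A| )
    result1 = one_norm(R - Q.conj().T @ A) / (eps * max(1, m) * one_norm(A))

    # TEST 2: |I - (Q**H)*Q| / ( eps * m )
    result2 = one_norm(np.eye(m) - Q.conj().T @ Q) / (eps * max(1, m))

    print(result1, result2)                   # both O(1) for a healthy factorization

Both quotients stay of order one when the computed factorization is backward stable, which is exactly what the THRESH comparison in the driver enforces.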
+* +* +* Generate the m-by-m matrix Q +* + CALL CLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL CLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL CLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = CLANGE( '1', M, N, A, M, RWORK ) + RESID = CLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL CLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL CHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = CLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL CLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = CLANGE( '1', M, N, C, M, RWORK ) + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL CGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL CLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL CGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = CLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL CLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = CLANGE( '1', N, M, D, N, RWORK ) + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL CLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'CGEMQRT' + CALL CGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL CGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = CLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of CUNHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/dchkorhr_col.f b/lapack-netlib/TESTING/LIN/dchkorhr_col.f index 3b3e421eb..0e2d44d8d 100644 --- a/lapack-netlib/TESTING/LIN/dchkorhr_col.f +++ 
b/lapack-netlib/TESTING/LIN/dchkorhr_col.f @@ -24,9 +24,12 @@ *> *> \verbatim *> -*> DCHKORHR_COL tests DORHR_COL using DLATSQR and DGEMQRT. Therefore, DLATSQR -*> (used in DGEQR) and DGEMQRT (used in DGEMQR) have to be tested -*> before this test. +*> DCHKORHR_COL tests: +*> 1) DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT, +*> 2) DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup double_lin * * ===================================================================== - SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE DCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01 + EXTERNAL ALAHD, ALASUM, DERRORHR_COL, DORHR_COL01, + $ DORHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test DORHR_COL * - CALL DORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL DORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test DORHR_COL +* + CALL DORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
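The loop nest just added to the driver is a five-way parameter sweep: every (M, N, MB1, NB1, NB2) combination from the input value lists is visited, filtered by M >= N, MB1 > N, and positive NB1 and NB2. A compact sketch of the same traversal, assuming hypothetical mval/nval/nbval lists and a run_col02 callback in place of the DORHR_COL02 call:

    from itertools import product

    def sweep(mval, nval, nbval, run_col02):
        # Visit the same (M, N, MB1, NB1, NB2) tuples as the Fortran loops.
        nrun = 0
        for m, n in product(mval, nval):
            if not (min(m, n) > 0 and m >= n):      # Only for M >= N
                continue
            for mb1 in nbval:
                if mb1 <= n:                        # Only for MB1 > N
                    continue
                for nb1, nb2 in product(nbval, repeat=2):
                    if nb1 > 0 and nb2 > 0:
                        run_col02(m, n, mb1, nb1, nb2)
                        nrun += 6                   # NRUN = NRUN + NTESTS
        return nrun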
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'DORGTSQR and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'DORGTSQR_ROW and DORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of DCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/ddrvrfp.f b/lapack-netlib/TESTING/LIN/ddrvrfp.f index d67cf6713..18ccbdfc4 100644 --- a/lapack-netlib/TESTING/LIN/ddrvrfp.f +++ b/lapack-netlib/TESTING/LIN/ddrvrfp.f @@ -443,8 +443,7 @@ * CALL DPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN - + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/dorhr_col01.f b/lapack-netlib/TESTING/LIN/dorhr_col01.f index 3e48de37f..979255ca9 100644 --- a/lapack-netlib/TESTING/LIN/dorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/dorhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> DORHR_COL01 tests DORHR_COL using DLATSQR, DGEMQRT and DORGTSQR. -*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part DGEMQR), DORGTSQR +*> DORHR_COL01 tests DORGTSQR and DORHR_COL using DLATSQR, DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup single_lin +*> \ingroup double_lin * * ===================================================================== SUBROUTINE DORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. 
of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 diff --git a/lapack-netlib/TESTING/LIN/dorhr_col02.f b/lapack-netlib/TESTING/LIN/dorhr_col02.f new file mode 100644 index 000000000..d4c438edb --- /dev/null +++ b/lapack-netlib/TESTING/LIN/dorhr_col02.f @@ -0,0 +1,377 @@ +*> \brief \b DORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> DORHR_COL02 tests DORGTSQR_ROW and DORHR_COL inside DGETSQRHRT +*> (which calls DLATSQR, DORGTSQR_ROW and DORHR_COL) using DGEMQRT. +*> Therefore, DLATSQR (part of DGEQR), DGEMQRT (part of DGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using DGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using DGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup double_lin +* +* ===================================================================== + SUBROUTINE DORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. 
+ DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + DOUBLE PRECISION, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + DOUBLE PRECISION ONE, ZERO + PARAMETER ( ZERO = 0.0D+0, ONE = 1.0D+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + DOUBLE PRECISION WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, DLANGE, DLANSY + EXTERNAL DLAMCH, DLANGE, DLANSY +* .. +* .. External Subroutines .. + EXTERNAL DLACPY, DLARNV, DLASET, DGETSQRHRT, + $ DSCAL, DGEMM, DGEMQRT, DSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL DLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL DLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in DLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* DGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In DGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'DGETSQRHRT' + CALL DGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
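The WORKQUERY call above with LWORK = -1 is the standard LAPACK workspace query: the routine writes its optimal workspace size into WORK(1) without factoring anything, and the caller allocates and calls again. The same idiom through SciPy's low-level wrappers, shown with DGEQRF since DGETSQRHRT has no SciPy wrapper; a sketch of the convention, not of this test:

    import numpy as np
    from scipy.linalg import lapack

    a = np.asfortranarray(np.random.rand(8, 4))

    # Query pass: LWORK = -1, nothing is factored, work[0] holds the optimum.
    qr, tau, work, info = lapack.dgeqrf(a, lwork=-1)
    lwork = int(work[0])

    # Real pass with the queried workspace size.
    qr, tau, work, info = lapack.dgeqrf(a, lwork=lwork)
    assert info == 0

The extra MAX against NB2_UB*N and NB2_UB*M afterwards covers the later DGEMQRT calls, which reuse the same WORK array from both sides.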
+* +* +* Generate the m-by-m matrix Q +* + CALL DLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL DLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL DLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = DLANGE( '1', M, N, A, M, RWORK ) + RESID = DLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL DLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL DSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = DLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL DLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = DLANGE( '1', M, N, C, M, RWORK ) + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL DGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL DLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL DGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = DLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL DLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = DLANGE( '1', N, M, D, N, RWORK ) + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL DLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'DGEMQRT' + CALL DGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL DGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = DLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of DORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/schkorhr_col.f b/lapack-netlib/TESTING/LIN/schkorhr_col.f index cf6d2d323..f61b74902 100644 --- a/lapack-netlib/TESTING/LIN/schkorhr_col.f +++ b/lapack-netlib/TESTING/LIN/schkorhr_col.f 
@@ -24,8 +24,11 @@ *> *> \verbatim *> -*> SCHKORHR_COL tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SCHKORHR_COL tests: +*> 1) SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT, +*> 2) SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* -*> \ingroup sigle_lin +*> \ingroup single_lin * * ===================================================================== - SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE SCHKORHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* June 2019 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,7 +135,8 @@ REAL RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01 + EXTERNAL ALAHD, ALASUM, SERRORHR_COL, SORHR_COL01, + $ SORHR_COL02 * .. * .. Intrinsic Functions .. INTRINSIC MAX, MIN @@ -201,8 +202,8 @@ * * Test SORHR_COL * - CALL SORHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL SORHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test SORHR_COL +* + CALL SORHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. * CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'SORGTSQR and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'SORGTSQR_ROW and SORHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of SCHKORHR_COL diff --git a/lapack-netlib/TESTING/LIN/sdrvrfp.f b/lapack-netlib/TESTING/LIN/sdrvrfp.f index 4b022bcfb..c0eb4d564 100644 --- a/lapack-netlib/TESTING/LIN/sdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/sdrvrfp.f @@ -443,7 +443,7 @@ * CALL SPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 
0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/sorhr_col01.f b/lapack-netlib/TESTING/LIN/sorhr_col01.f index 02429041b..dcc2c1cae 100644 --- a/lapack-netlib/TESTING/LIN/sorhr_col01.f +++ b/lapack-netlib/TESTING/LIN/sorhr_col01.f @@ -8,12 +8,12 @@ * Definition: * =========== * -* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT) +* SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) * * .. Scalar Arguments .. * INTEGER M, N, MB1, NB1, NB2 * .. Return values .. -* REAL RESULT(6) +* REAL RESULT(6) * * *> \par Purpose: @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> SORHR_COL01 tests SORHR_COL using SLATSQR, SGEMQRT and SORGTSQR. -*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part SGEMQR), SORGTSQR +*> SORHR_COL01 tests SORGTSQR and SORHR_COL using SLATSQR, SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is REAL array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup single_lin * * ===================================================================== SUBROUTINE SORHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. Local allocatable arrays - REAL, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) * @@ -128,7 +157,7 @@ $ SORGTSQR, SSCAL, SGEMM, SGEMQRT, SSYRK * .. * .. Intrinsic Functions .. 
- INTRINSIC CEILING, MAX, MIN, REAL + INTRINSIC CEILING, REAL, MAX, MIN * .. * .. Scalars in Common .. CHARACTER(LEN=32) SRNAMT @@ -230,7 +259,7 @@ * * Compute the factor R_hr corresponding to the Householder * reconstructed Q_hr and place it in the upper triangle of AF to -* match the Q storage format in DGEQRT. R_hr = R_tsqr * S, +* match the Q storage format in SGEQRT. R_hr = R_tsqr * S, * this means changing the sign of I-th row of the matrix R_tsqr * according to sign of of I-th diagonal element DIAG(I) of the * matrix S. diff --git a/lapack-netlib/TESTING/LIN/sorhr_col02.f b/lapack-netlib/TESTING/LIN/sorhr_col02.f new file mode 100644 index 000000000..1cbe40577 --- /dev/null +++ b/lapack-netlib/TESTING/LIN/sorhr_col02.f @@ -0,0 +1,376 @@ +*> \brief \b SORHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* REAL RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> SORHR_COL02 tests SORGTSQR_ROW and SORHR_COL inside SGETSQRHRT +*> (which calls SLATSQR, SORGTSQR_ROW and SORHR_COL) using SGEMQRT. +*> Therefore, SLATSQR (part of SGEQR), SGEMQRT (part of SGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of row in row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in column block an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is REAL array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m orthogonal Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in SGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m orthogonal matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using SGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using SGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ. of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. 
+* +*> \ingroup single_lin +* +* ===================================================================== + SUBROUTINE SORHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + REAL RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + REAL , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ RWORK(:), WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) +* +* .. Parameters .. + REAL ONE, ZERO + PARAMETER ( ZERO = 0.0E+0, ONE = 1.0E+0 ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + REAL ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + REAL WORKQUERY( 1 ) +* .. +* .. External Functions .. + REAL SLAMCH, SLANGE, SLANSY + EXTERNAL SLAMCH, SLANGE, SLANSY +* .. +* .. External Subroutines .. + EXTERNAL SLACPY, SLARNV, SLASET, SGETSQRHRT, + $ SSCAL, SGEMM, SGEMQRT, SSYRK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, REAL, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = SLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL SLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL SLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in SLATSQR +* + NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* SGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In SGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'SGETSQRHRT' + CALL SGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
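The row-block count NRB used above when sizing T1 follows from the SLATSQR blocking: with MB1 > N, the first block consumes MB1 rows and each later step consumes MB1 - N fresh rows (N rows of the intermediate R carry over), giving NRB = max(1, ceil((M - N)/(MB1 - N))). A one-line check with illustrative values:

    from math import ceil

    def num_row_blocks(m, n, mb1):
        # NRB = MAX( 1, CEILING( REAL( M - N ) / REAL( MB1 - N ) ) )
        assert mb1 > n, "the drivers only exercise MB1 > N"
        return max(1, ceil((m - n) / (mb1 - n)))

    print(num_row_blocks(20, 3, 5))   # ceil(17 / 2) = 9 row blocks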
+* +* +* Generate the m-by-m matrix Q +* + CALL SLASET( 'Full', M, M, ZERO, ONE, Q, M ) +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL SLASET( 'Full', M, N, ZERO, ZERO, R, M ) +* + CALL SLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**T)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, A, M, ONE, R, M ) +* + ANORM = SLANGE( '1', M, N, A, M, RWORK ) + RESID = SLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**T)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL SLASET( 'Full', M, M, ZERO, ONE, R, M ) + CALL SSYRK( 'U', 'T', M, M, -ONE, Q, M, ONE, R, M ) + RESID = SLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL SLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = SLANGE( '1', M, N, C, M, RWORK ) + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL SGEMM( 'N', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL SLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**T)*C = CF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'L', 'T', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**T)*C| / ( eps * m * |C|) +* + CALL SGEMM( 'T', 'N', M, N, M, -ONE, Q, M, C, M, ONE, CF, M ) + RESID = SLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL SLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = SLANGE( '1', N, M, D, N, RWORK ) + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'N', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL SLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*QT = DF +* + SRNAMT = 'SGEMQRT' + CALL SGEMQRT( 'R', 'T', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**T)| / ( eps * m * |D| ) +* + CALL SGEMM( 'N', 'T', N, M, M, -ONE, D, N, Q, M, ONE, DF, N ) + RESID = SLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of SORHR_COL02 +* + END diff --git a/lapack-netlib/TESTING/LIN/zchkunhr_col.f b/lapack-netlib/TESTING/LIN/zchkunhr_col.f index ef8f8bcc4..395ea178a 100644 --- a/lapack-netlib/TESTING/LIN/zchkunhr_col.f +++ b/lapack-netlib/TESTING/LIN/zchkunhr_col.f 
@@ -24,9 +24,12 @@ *> *> \verbatim *> -*> ZCHKUNHR_COL tests ZUNHR_COL using ZLATSQR and ZGEMQRT. Therefore, ZLATSQR -*> (used in ZGEQR) and ZGEMQRT (used in ZGEMQR) have to be tested -*> before this test. +*> ZCHKUNHR_COL tests: +*> 1) ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT, +*> 2) ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. *> *> \endverbatim * @@ -97,19 +100,16 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== - SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, NNB, - $ NBVAL, NOUT ) + SUBROUTINE ZCHKUNHR_COL( THRESH, TSTERR, NM, MVAL, NN, NVAL, + $ NNB, NBVAL, NOUT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.7.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* December 2016 * * .. Scalar Arguments .. LOGICAL TSTERR @@ -135,10 +135,11 @@ DOUBLE PRECISION RESULT( NTESTS ) * .. * .. External Subroutines .. - EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01 + EXTERNAL ALAHD, ALASUM, ZERRUNHR_COL, ZUNHR_COL01, + $ ZUNHR_COL02 * .. * .. Intrinsic Functions .. - INTRINSIC MAX, MIN + INTRINSIC MAX, MIN * .. * .. Scalars in Common .. LOGICAL LERR, OK @@ -201,8 +202,8 @@ * * Test ZUNHR_COL * - CALL ZUNHR_COL01( M, N, MB1, NB1, NB2, - $ RESULT ) + CALL ZUNHR_COL01( M, N, MB1, NB1, + $ NB2, RESULT ) * * Print information about the tests that did * not pass the threshold. @@ -226,12 +227,78 @@ END DO END DO * +* Do for each value of M in MVAL. +* + DO I = 1, NM + M = MVAL( I ) +* +* Do for each value of N in NVAL. +* + DO J = 1, NN + N = NVAL( J ) +* +* Only for M >= N +* + IF ( MIN( M, N ).GT.0 .AND. M.GE.N ) THEN +* +* Do for each possible value of MB1 +* + DO IMB1 = 1, NNB + MB1 = NBVAL( IMB1 ) +* +* Only for MB1 > N +* + IF ( MB1.GT.N ) THEN +* +* Do for each possible value of NB1 +* + DO INB1 = 1, NNB + NB1 = NBVAL( INB1 ) +* +* Do for each possible value of NB2 +* + DO INB2 = 1, NNB + NB2 = NBVAL( INB2 ) +* + IF( NB1.GT.0 .AND. NB2.GT.0 ) THEN +* +* Test ZUNHR_COL +* + CALL ZUNHR_COL02( M, N, MB1, NB1, + $ NB2, RESULT ) +* +* Print information about the tests that did +* not pass the threshold. +* + DO T = 1, NTESTS + IF( RESULT( T ).GE.THRESH ) THEN + IF( NFAIL.EQ.0 .AND. NERRS.EQ.0 ) + $ CALL ALAHD( NOUT, PATH ) + WRITE( NOUT, FMT = 9998 ) M, N, MB1, + $ NB1, NB2, T, RESULT( T ) + NFAIL = NFAIL + 1 + END IF + END DO + NRUN = NRUN + NTESTS + END IF + END DO + END DO + END IF + END DO + END IF + END DO + END DO +* * Print a summary of the results. 
* CALL ALASUM( PATH, NOUT, NFAIL, NRUN, NERRS ) * - 9999 FORMAT( 'M=', I5, ', N=', I5, ', MB1=', I5, - $ ', NB1=', I5, ', NB2=', I5,' test(', I2, ')=', G12.5 ) + 9999 FORMAT( 'ZUNGTSQR and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) + 9998 FORMAT( 'ZUNGTSQR_ROW and ZUNHR_COL: M=', I5, ', N=', I5, + $ ', MB1=', I5, ', NB1=', I5, ', NB2=', I5, + $ ' test(', I2, ')=', G12.5 ) RETURN * * End of ZCHKUNHR_COL diff --git a/lapack-netlib/TESTING/LIN/zdrvgex.f b/lapack-netlib/TESTING/LIN/zdrvgex.f index cdfa10727..1b784d31b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvgex.f +++ b/lapack-netlib/TESTING/LIN/zdrvgex.f @@ -707,9 +707,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( 'Full', N, N, ZERO, ZERO, AFAC, - $ LDA ) - CALL ZLASET( 'Full', N, NRHS, ZERO, ZERO, X, LDA ) + $ CALL ZLASET( 'Full', N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT = 'F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvhex.f b/lapack-netlib/TESTING/LIN/zdrvhex.f index 3c0dfbfe4..527114508 100644 --- a/lapack-netlib/TESTING/LIN/zdrvhex.f +++ b/lapack-netlib/TESTING/LIN/zdrvhex.f @@ -599,10 +599,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZHESVXX. diff --git a/lapack-netlib/TESTING/LIN/zdrvpox.f b/lapack-netlib/TESTING/LIN/zdrvpox.f index 260d8c1f2..0bc2c89d8 100644 --- a/lapack-netlib/TESTING/LIN/zdrvpox.f +++ b/lapack-netlib/TESTING/LIN/zdrvpox.f @@ -611,10 +611,10 @@ CALL ZLACPY( 'Full', N, NRHS, BSAV, LDA, B, LDA ) IF( .NOT.PREFAC ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) IF( IEQUED.GT.1 .AND. N.GT.0 ) THEN * * Equilibrate the matrix if FACT='F' and diff --git a/lapack-netlib/TESTING/LIN/zdrvrfp.f b/lapack-netlib/TESTING/LIN/zdrvrfp.f index c7be7da03..b299a487b 100644 --- a/lapack-netlib/TESTING/LIN/zdrvrfp.f +++ b/lapack-netlib/TESTING/LIN/zdrvrfp.f @@ -450,7 +450,7 @@ * CALL ZPOTRI( UPLO, N, A, LDA, INFO ) - IF ( N .NE. 0 ) THEN + IF ( N .NE. 0 ) THEN * * Compute the 1-norm condition number of A. * diff --git a/lapack-netlib/TESTING/LIN/zdrvsyx.f b/lapack-netlib/TESTING/LIN/zdrvsyx.f index 9431cd692..e4556f150 100644 --- a/lapack-netlib/TESTING/LIN/zdrvsyx.f +++ b/lapack-netlib/TESTING/LIN/zdrvsyx.f @@ -605,10 +605,10 @@ * Restore the matrices A and B. * IF( IFACT.EQ.2 ) - $ CALL ZLASET( UPLO, N, N, CMPLX( ZERO ), - $ CMPLX( ZERO ), AFAC, LDA ) - CALL ZLASET( 'Full', N, NRHS, CMPLX( ZERO ), - $ CMPLX( ZERO ), X, LDA ) + $ CALL ZLASET( UPLO, N, N, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), AFAC, LDA ) + CALL ZLASET( 'Full', N, NRHS, DCMPLX( ZERO ), + $ DCMPLX( ZERO ), X, LDA ) * * Solve the system and compute the condition number * and error bounds using ZSYSVXX. 
diff --git a/lapack-netlib/TESTING/LIN/zerrvxx.f b/lapack-netlib/TESTING/LIN/zerrvxx.f index 9dc008215..bdaf44d8a 100644 --- a/lapack-netlib/TESTING/LIN/zerrvxx.f +++ b/lapack-netlib/TESTING/LIN/zerrvxx.f @@ -1166,7 +1166,7 @@ $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, $ ERR_BNDS_C, NPARAMS, PARAMS, W, RW, INFO ) CALL CHKXER( 'ZSYSVXX', INFOT, NOUT, LERR, OK ) - INFOT = 13 + INFOT = 13 EQ = 'N' CALL ZSYSVXX( 'N', 'U', 2, 0, A, 2, AF, 2, IP, EQ, R, B, 1, X, $ 2, RCOND, RPVGRW, BERR, N_ERR_BNDS, ERR_BNDS_N, diff --git a/lapack-netlib/TESTING/LIN/zunhr_col01.f b/lapack-netlib/TESTING/LIN/zunhr_col01.f index 9fb3bf352..b7590a8ea 100644 --- a/lapack-netlib/TESTING/LIN/zunhr_col01.f +++ b/lapack-netlib/TESTING/LIN/zunhr_col01.f @@ -21,8 +21,8 @@ *> *> \verbatim *> -*> ZUNHR_COL01 tests ZUNHR_COL using ZLATSQR, ZGEMQRT and ZUNGTSQR. -*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part ZGEMQR), ZUNGTSQR +*> ZUNHR_COL01 tests ZUNGTSQR and ZUNHR_COL using ZLATSQR, ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) *> have to be tested before this test. *> *> \endverbatim @@ -62,14 +62,46 @@ *> \verbatim *> RESULT is DOUBLE PRECISION array, dimension (6) *> Results of each of the six tests below. -*> ( C is a M-by-N random matrix, D is a N-by-M random matrix ) *> -*> RESULT(1) = | A - Q * R | / (eps * m * |A|) -*> RESULT(2) = | I - (Q**H) * Q | / (eps * m ) -*> RESULT(3) = | Q * C - Q * C | / (eps * m * |C|) -*> RESULT(4) = | (Q**H) * C - (Q**H) * C | / (eps * m * |C|) -*> RESULT(5) = | (D * Q) - D * Q | / (eps * m * |D|) -*> RESULT(6) = | D * (Q**H) - D * (Q**H) | / (eps * m * |D|) +*> A is a m-by-n test input matrix to be factored. +*> so that A = Q_gr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is a n-by-n upper-triangular matrix, +*> +*> 0 is a (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_gr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_gr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_gr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. *> \endverbatim * * Authors: @@ -80,18 +112,15 @@ *> \author Univ. of Colorado Denver *> \author NAG Ltd. * -*> \date November 2019 -* *> \ingroup complex16_lin * * ===================================================================== SUBROUTINE ZUNHR_COL01( M, N, MB1, NB1, NB2, RESULT ) IMPLICIT NONE * -* -- LAPACK test routine (version 3.9.0) -- +* -- LAPACK test routine -- * -- LAPACK is a software package provided by Univ. of Tennessee, -- * -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- -* November 2019 * * .. Scalar Arguments .. INTEGER M, N, MB1, NB1, NB2 @@ -102,7 +131,7 @@ * * .. * .. 
Local allocatable arrays - COMPLEX*16, ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), $ C(:,:), CF(:,:), D(:,:), DF(:,:) DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) @@ -218,7 +247,7 @@ * Copy the factor R into the array R. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, AF, M, R, M ) + CALL ZLACPY( 'U', N, N, AF, M, R, M ) * * Reconstruct the orthogonal matrix Q. * @@ -240,7 +269,7 @@ * matrix S. * SRNAMT = 'ZLACPY' - CALL ZLACPY( 'U', M, N, R, M, AF, M ) + CALL ZLACPY( 'U', N, N, R, M, AF, M ) * DO I = 1, N IF( DIAG( I ).EQ.-CONE ) THEN diff --git a/lapack-netlib/TESTING/LIN/zunhr_col02.f b/lapack-netlib/TESTING/LIN/zunhr_col02.f new file mode 100644 index 000000000..c6e7f80cd --- /dev/null +++ b/lapack-netlib/TESTING/LIN/zunhr_col02.f @@ -0,0 +1,381 @@ +*> \brief \b ZUNHR_COL02 +* +* =========== DOCUMENTATION =========== +* +* Online html documentation available at +* http://www.netlib.org/lapack/explore-html/ +* +* Definition: +* =========== +* +* SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) +* +* .. Scalar Arguments .. +* INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. +* DOUBLE PRECISION RESULT(6) +* +* +*> \par Purpose: +* ============= +*> +*> \verbatim +*> +*> ZUNHR_COL02 tests ZUNGTSQR_ROW and ZUNHR_COL inside ZGETSQRHRT +*> (which calls ZLATSQR, ZUNGTSQR_ROW and ZUNHR_COL) using ZGEMQRT. +*> Therefore, ZLATSQR (part of ZGEQR), ZGEMQRT (part of ZGEMQR) +*> have to be tested before this test. +*> +*> \endverbatim +* +* Arguments: +* ========== +* +*> \param[in] M +*> \verbatim +*> M is INTEGER +*> Number of rows in test matrix. +*> \endverbatim +*> \param[in] N +*> \verbatim +*> N is INTEGER +*> Number of columns in test matrix. +*> \endverbatim +*> \param[in] MB1 +*> \verbatim +*> MB1 is INTEGER +*> Number of rows in a row block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB1 +*> \verbatim +*> NB1 is INTEGER +*> Number of columns in a column block in an input test matrix. +*> \endverbatim +*> +*> \param[in] NB2 +*> \verbatim +*> NB2 is INTEGER +*> Number of columns in a column block in an output test matrix. +*> \endverbatim +*> +*> \param[out] RESULT +*> \verbatim +*> RESULT is DOUBLE PRECISION array, dimension (6) +*> Results of each of the six tests below. +*> +*> A is an m-by-n test input matrix to be factored, +*> so that A = Q_qr * ( R ) +*> ( 0 ), +*> +*> Q_qr is an implicit m-by-m unitary Q matrix, the result +*> of factorization in blocked WY-representation, +*> stored in ZGEQRT output format. +*> +*> R is an n-by-n upper-triangular matrix, +*> +*> 0 is an (m-n)-by-n zero matrix, +*> +*> Q is an explicit m-by-m unitary matrix Q = Q_qr * I +*> +*> C is an m-by-n random matrix, +*> +*> D is an n-by-m random matrix. +*> +*> The six tests are: +*> +*> RESULT(1) = |R - (Q**H) * A| / ( eps * m * |A| ) +*> is equivalent to test for | A - Q * R | / (eps * m * |A|), +*> +*> RESULT(2) = |I - (Q**H) * Q| / ( eps * m ), +*> +*> RESULT(3) = | Q_qr * C - Q * C | / (eps * m * |C|), +*> +*> RESULT(4) = | (Q_qr**H) * C - (Q**H) * C | / (eps * m * |C|) +*> +*> RESULT(5) = | D * Q_qr - D * Q | / (eps * m * |D|) +*> +*> RESULT(6) = | D * (Q_qr**H) - D * (Q**H) | / (eps * m * |D|), +*> +*> where: +*> Q_qr * C, (Q_qr**H) * C, D * Q_qr, D * (Q_qr**H) are +*> computed using ZGEMQRT, +*> +*> Q * C, (Q**H) * C, D * Q, D * (Q**H) are +*> computed using ZGEMM. +*> \endverbatim +* +* Authors: +* ======== +* +*> \author Univ. of Tennessee +*> \author Univ.
of California Berkeley +*> \author Univ. of Colorado Denver +*> \author NAG Ltd. +* +*> \ingroup complex16_lin +* +* ===================================================================== + SUBROUTINE ZUNHR_COL02( M, N, MB1, NB1, NB2, RESULT ) + IMPLICIT NONE +* +* -- LAPACK test routine -- +* -- LAPACK is a software package provided by Univ. of Tennessee, -- +* -- Univ. of California Berkeley, Univ. of Colorado Denver and NAG Ltd..-- +* +* .. Scalar Arguments .. + INTEGER M, N, MB1, NB1, NB2 +* .. Return values .. + DOUBLE PRECISION RESULT(6) +* +* ===================================================================== +* +* .. +* .. Local allocatable arrays + COMPLEX*16 , ALLOCATABLE :: A(:,:), AF(:,:), Q(:,:), R(:,:), + $ WORK( : ), T1(:,:), T2(:,:), DIAG(:), + $ C(:,:), CF(:,:), D(:,:), DF(:,:) + DOUBLE PRECISION, ALLOCATABLE :: RWORK(:) +* +* .. Parameters .. + DOUBLE PRECISION ZERO + PARAMETER ( ZERO = 0.0D+0 ) + COMPLEX*16 CONE, CZERO + PARAMETER ( CONE = ( 1.0D+0, 0.0D+0 ), + $ CZERO = ( 0.0D+0, 0.0D+0 ) ) +* .. +* .. Local Scalars .. + LOGICAL TESTZEROS + INTEGER INFO, J, K, L, LWORK, NB2_UB, NRB + DOUBLE PRECISION ANORM, EPS, RESID, CNORM, DNORM +* .. +* .. Local Arrays .. + INTEGER ISEED( 4 ) + COMPLEX*16 WORKQUERY( 1 ) +* .. +* .. External Functions .. + DOUBLE PRECISION DLAMCH, ZLANGE, ZLANSY + EXTERNAL DLAMCH, ZLANGE, ZLANSY +* .. +* .. External Subroutines .. + EXTERNAL ZLACPY, ZLARNV, ZLASET, ZGETSQRHRT, + $ ZSCAL, ZGEMM, ZGEMQRT, ZHERK +* .. +* .. Intrinsic Functions .. + INTRINSIC CEILING, DBLE, MAX, MIN +* .. +* .. Scalars in Common .. + CHARACTER(LEN=32) SRNAMT +* .. +* .. Common blocks .. + COMMON / SRMNAMC / SRNAMT +* .. +* .. Data statements .. + DATA ISEED / 1988, 1989, 1990, 1991 / +* +* TEST MATRICES WITH HALF OF MATRIX BEING ZEROS +* + TESTZEROS = .FALSE. +* + EPS = DLAMCH( 'Epsilon' ) + K = MIN( M, N ) + L = MAX( M, N, 1) +* +* Dynamically allocate local arrays +* + ALLOCATE ( A(M,N), AF(M,N), Q(L,L), R(M,L), RWORK(L), + $ C(M,N), CF(M,N), + $ D(N,M), DF(N,M) ) +* +* Put random numbers into A and copy to AF +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, A( 1, J ) ) + END DO + IF( TESTZEROS ) THEN + IF( M.GE.4 ) THEN + DO J = 1, N + CALL ZLARNV( 2, ISEED, M/2, A( M/4, J ) ) + END DO + END IF + END IF + CALL ZLACPY( 'Full', M, N, A, M, AF, M ) +* +* Number of row blocks in ZLATSQR +* + NRB = MAX( 1, CEILING( DBLE( M - N ) / DBLE( MB1 - N ) ) ) +* + ALLOCATE ( T1( NB1, N * NRB ) ) + ALLOCATE ( T2( NB2, N ) ) + ALLOCATE ( DIAG( N ) ) +* +* Begin determine LWORK for the array WORK and allocate memory. +* +* ZGEMQRT requires NB2 to be bounded by N. +* + NB2_UB = MIN( NB2, N) +* +* + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORKQUERY, -1, INFO ) +* + LWORK = INT( WORKQUERY( 1 ) ) +* +* In ZGEMQRT, WORK is N*NB2_UB if SIDE = 'L', +* or M*NB2_UB if SIDE = 'R'. +* + LWORK = MAX( LWORK, NB2_UB * N, NB2_UB * M ) +* + ALLOCATE ( WORK( LWORK ) ) +* +* End allocate memory for WORK. +* +* +* Begin Householder reconstruction routines +* +* Factor the matrix A in the array AF. +* + SRNAMT = 'ZGETSQRHRT' + CALL ZGETSQRHRT( M, N, MB1, NB1, NB2, AF, M, T2, NB2, + $ WORK, LWORK, INFO ) +* +* End Householder reconstruction routines. 
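The pair of ZGETSQRHRT calls above follows the standard LAPACK workspace-query convention: a first call with LWORK = -1 performs no factorization and only reports the optimal workspace size in element 1 of the WORK argument, after which the caller allocates WORK and repeats the call. A minimal free-form sketch of that idiom with the same argument layout as the test code (the wrapper name factor_with_query is hypothetical; ZGETSQRHRT itself comes from the bundled Reference-LAPACK sources):

      subroutine factor_with_query( m, n, mb1, nb1, nb2, af, t2, info )
         implicit none
         integer,    intent(in)    :: m, n, mb1, nb1, nb2
         complex*16, intent(inout) :: af( m, n ), t2( nb2, n )
         integer,    intent(out)   :: info
         complex*16                :: wquery( 1 )
         complex*16, allocatable   :: work( : )
         integer                   :: lwork
         external                  :: zgetsqrhrt
         ! Query call: LWORK = -1 requests only the optimal workspace
         ! size, returned in WQUERY(1); nothing is factored yet.
         call zgetsqrhrt( m, n, mb1, nb1, nb2, af, m, t2, nb2, &
                          wquery, -1, info )
         if( info.ne.0 ) return
         lwork = int( wquery( 1 ) )
         allocate ( work( lwork ) )
         ! Second call performs the actual blocked QR factorization
         ! with Householder reconstruction.
         call zgetsqrhrt( m, n, mb1, nb1, nb2, af, m, t2, nb2, &
                          work, lwork, info )
         deallocate ( work )
      end subroutine factor_with_query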
+* +* +* Generate the m-by-m matrix Q +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, Q, M ) +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, M, K, NB2_UB, AF, M, T2, NB2, Q, M, + $ WORK, INFO ) +* +* Copy R +* + CALL ZLASET( 'Full', M, N, CZERO, CZERO, R, M ) +* + CALL ZLACPY( 'Upper', M, N, AF, M, R, M ) +* +* TEST 1 +* Compute |R - (Q**H)*A| / ( eps * m * |A| ) and store in RESULT(1) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, A, M, CONE, R, M ) +* + ANORM = ZLANGE( '1', M, N, A, M, RWORK ) + RESID = ZLANGE( '1', M, N, R, M, RWORK ) + IF( ANORM.GT.ZERO ) THEN + RESULT( 1 ) = RESID / ( EPS * MAX( 1, M ) * ANORM ) + ELSE + RESULT( 1 ) = ZERO + END IF +* +* TEST 2 +* Compute |I - (Q**H)*Q| / ( eps * m ) and store in RESULT(2) +* + CALL ZLASET( 'Full', M, M, CZERO, CONE, R, M ) + CALL ZHERK( 'U', 'C', M, M, -CONE, Q, M, CONE, R, M ) + RESID = ZLANSY( '1', 'Upper', M, R, M, RWORK ) + RESULT( 2 ) = RESID / ( EPS * MAX( 1, M ) ) +* +* Generate random m-by-n matrix C +* + DO J = 1, N + CALL ZLARNV( 2, ISEED, M, C( 1, J ) ) + END DO + CNORM = ZLANGE( '1', M, N, C, M, RWORK ) + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as Q*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'N', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 3 +* Compute |CF - Q*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'N', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 3 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 3 ) = ZERO + END IF +* +* Copy C into CF again +* + CALL ZLACPY( 'Full', M, N, C, M, CF, M ) +* +* Apply Q to C as (Q**H)*C = CF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'L', 'C', M, N, K, NB2_UB, AF, M, T2, NB2, CF, M, + $ WORK, INFO ) +* +* TEST 4 +* Compute |CF - (Q**H)*C| / ( eps * m * |C| ) +* + CALL ZGEMM( 'C', 'N', M, N, M, -CONE, Q, M, C, M, CONE, CF, M ) + RESID = ZLANGE( '1', M, N, CF, M, RWORK ) + IF( CNORM.GT.ZERO ) THEN + RESULT( 4 ) = RESID / ( EPS * MAX( 1, M ) * CNORM ) + ELSE + RESULT( 4 ) = ZERO + END IF +* +* Generate random n-by-m matrix D and a copy DF +* + DO J = 1, M + CALL ZLARNV( 2, ISEED, N, D( 1, J ) ) + END DO + DNORM = ZLANGE( '1', N, M, D, N, RWORK ) + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*Q = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'N', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 5 +* Compute |DF - D*Q| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'N', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 5 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 5 ) = ZERO + END IF +* +* Copy D into DF again +* + CALL ZLACPY( 'Full', N, M, D, N, DF, N ) +* +* Apply Q to D as D*(Q**H) = DF +* + SRNAMT = 'ZGEMQRT' + CALL ZGEMQRT( 'R', 'C', N, M, K, NB2_UB, AF, M, T2, NB2, DF, N, + $ WORK, INFO ) +* +* TEST 6 +* Compute |DF - D*(Q**H)| / ( eps * m * |D| ) +* + CALL ZGEMM( 'N', 'C', N, M, M, -CONE, D, N, Q, M, CONE, DF, N ) + RESID = ZLANGE( '1', N, M, DF, N, RWORK ) + IF( DNORM.GT.ZERO ) THEN + RESULT( 6 ) = RESID / ( EPS * MAX( 1, M ) * DNORM ) + ELSE + RESULT( 6 ) = ZERO + END IF +* +* Deallocate all arrays +* + DEALLOCATE ( A, AF, Q, R, RWORK, WORK, T1, T2, DIAG, + $ C, D, CF, DF ) +* + RETURN +* +* End of ZUNHR_COL02 +* + END diff --git a/param.h b/param.h index c41f75ec9..a35ce69bd 100644 @@ -2466,13 +2466,13 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define SGEMM_DEFAULT_P 512 #define DGEMM_DEFAULT_P 384 -#define CGEMM_DEFAULT_P 512 +#define CGEMM_DEFAULT_P 512 #define ZGEMM_DEFAULT_P 256 #define SGEMM_DEFAULT_Q 512 #define DGEMM_DEFAULT_Q 512 -#define CGEMM_DEFAULT_Q 1026 -#define ZGEMM_DEFAULT_Q 1026 +#define CGEMM_DEFAULT_Q 384 +#define ZGEMM_DEFAULT_Q 384 #define SGEMM_DEFAULT_R 4096 #define DGEMM_DEFAULT_R 4096
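The param.h hunk above retunes GEMM blocking factors: in OpenBLAS the P and Q defaults size the panels that the GEMM driver packs so that a P-by-Q block of A stays cache-resident while it is reused, with Q blocking the K dimension; dropping CGEMM/ZGEMM_DEFAULT_Q from 1026 to 384 therefore shortens the complex K-panels. A much-simplified free-form sketch of what such blocking means (the routine name blocked_zgemm_sketch and the literal block sizes are illustrative only; the real kernels add packing buffers, an R-sized partition of B, and hand-tuned microkernels):

      subroutine blocked_zgemm_sketch( m, n, k, a, b, c )
         implicit none
         ! Block sizes playing the role of ZGEMM_DEFAULT_P / _Q above.
         integer, parameter        :: p = 256, q = 384
         integer,    intent(in)    :: m, n, k
         complex*16, intent(in)    :: a( m, k ), b( k, n )
         complex*16, intent(inout) :: c( m, n )
         integer :: i, j, l, i0, l0
         ! Computes C := C + A*B one p-by-q panel of A at a time, so
         ! the panel can stay cache-resident while it is reused for
         ! every column of B.
         do l0 = 1, k, q
            do i0 = 1, m, p
               do j = 1, n
                  do l = l0, min( l0 + q - 1, k )
                     do i = i0, min( i0 + p - 1, m )
                        c( i, j ) = c( i, j ) + a( i, l ) * b( l, j )
                     end do
                  end do
               end do
            end do
         end do
      end subroutine blocked_zgemm_sketch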